commit 38a989cbfcbadccc35a22dd44b2d34fe7d253a40
from: Aleksey Ryndin
date: Fri Jan 17 15:24:04 2025 UTC

Add first worked version

commit - 5d0d11f445413917dffb40a2a259af37d9ccf8e5
commit + 38a989cbfcbadccc35a22dd44b2d34fe7d253a40
blob - 2d673a073775178ed19a8a9df063e0a016db7f0c
blob + d88ba135a02b04f994d67703ccb4f194096ffa9d
--- avk.py
+++ avk.py
@@ -120,6 +120,9 @@ def _parse_date(value: str):
 class Text:
     data: str
 
+    def build_xhtml(self, div):
+        ET.SubElement(div, "p").text = self.data
+
 @dataclass(kw_only=True, frozen=True)
 class Link:
     data: str
@@ -137,18 +140,27 @@ class Link:
 
         return cls(data=data, url=href)
 
+    def build_xhtml(self, div):
+        a = ET.SubElement(div, "a")
+        a.attrib.update(href=self.url)
+        a.text = self.data
+
 
 @dataclass(kw_only=True, frozen=True)
 class LineBreak:
-    pass
+    def build_xhtml(self, div):
+        ET.SubElement(div, "br")
 
 @dataclass(kw_only=True, frozen=True)
 class HorizontalRule:
-    pass
+    def build_xhtml(self, div):
+        ET.SubElement(div, "hr")
 
 @dataclass(kw_only=True, frozen=True)
 class Image:
     url: str
+    def build_xhtml(self, div):
+        ET.SubElement(div, "img").attrib.update(src=self.url)
 
 @dataclass(kw_only=True)
 class Entry:
@@ -156,24 +168,57 @@ class Entry:
     url: str = ""
     content: list[Text | Link | LineBreak | HorizontalRule | Image] = field(default_factory=list)
 
+    def build_xml(self, xml_feed):
+        xml_entry = ET.SubElement(xml_feed, "entry")
+        ET.SubElement(xml_entry, "id").text = self.url
+        ET.SubElement(xml_entry, "updated").text = self.updated.isoformat()
+        raw_text = " ".join(i.data if hasattr(i, "data") else "\n" for i in self.content)
+        lines = raw_text.splitlines()
+        title = lines[0] if lines else raw_text
+        pos = raw_text.find(". ")
+        if pos > 0:
+            title = title[:pos]
+        if "!" in title:
+            title = title.split("!")[0] + "!"
+        elif "?" in title:
+            title = title.split("?")[0] + "?"
+        ET.SubElement(xml_entry, "title").text = title.strip() or self.url
+
+        ET.SubElement(xml_entry, "link").attrib.update(rel="alternate", href=self.url)
+        content = ET.SubElement(xml_entry, "content")
+        content.attrib.update(type="xhtml")
+        div = ET.SubElement(content, "div")
+        div.attrib.update(xmlns="http://www.w3.org/1999/xhtml")
+        for i in self.content:
+            i.build_xhtml(div)
+
+
 
 @dataclass(kw_only=True)
 class Feed:
     url: str
     _title: str = ""
     entries: list[Entry] = field(default_factory=list)
 
-    def get_title(self) -> str:
-        return self._title or self.url
+    def is_title_set(self) -> bool:
+        return self._title != ""
 
-    def is_title_filled(self) -> bool:
-        return self._title
-
     def set_title(self, title: str):
         self._title = title
 
-    def get_updated(self) -> datetime:
-        return max(entry.updated for entry in self.entries)
+    def build_xml_tree(self, self_href):
+        xml_feed = ET.Element("feed")
+        xml_feed.attrib.update(xmlns="http://www.w3.org/2005/Atom")
+        ET.SubElement(xml_feed, "title").text = self._title or self.url
+        ET.SubElement(xml_feed, "id").text = self.url
+        updated = max(entry.updated for entry in self.entries) if self.entries else _get_now()
+        ET.SubElement(xml_feed, "updated").text = updated.isoformat()
+        generator = ET.SubElement(xml_feed, "generator")
+        generator.attrib.update(uri="https://got.any-key.press/?action=summary&path=avk.git", version=_AVK_VERSION)
+        generator.text = "Anonymous Atom feed generator (by VK)"
+        for entry in self.entries:
+            entry.build_xml(xml_feed)
+        return ET.ElementTree(xml_feed)
 
 
 class _HTMLParser(HTMLParser):
@@ -181,7 +226,7 @@ class _HTMLParser(HTMLParser):
         super().__init__(*args, **kwargs)
         self.__base_url = base_url
         self.__debug_print = debug_print
-        self.__stack = []
+        self.__divs = []
         self.__feed = Feed(url=base_url)
         self.__entry = Entry()
         self.__current_tag = None
@@ -192,11 +237,12 @@ class _HTMLParser(HTMLParser):
         self.__current_attrs = {pair[0]: pair[1] for pair in attrs}
 
         def _append():
-            self.__stack.append((tag, self.__current_attrs.get("class", "").strip()))
-            if self.__debug_print:
-                print(f"> {'.'.join(pair[0] for pair in self.__stack)} {attrs=}")
+            if tag == "div":
+                self.__divs.append(self.__current_attrs.get("class", "").strip())
+            if self.__debug_print:
+                print(f"> {'|'.join(div for div in self.__divs)} {attrs=}")
 
-        if not self.__stack:
+        if not self.__divs:
             if tag == "div" and self.__current_attrs.get("class", "").startswith("wall_item "):
                 self.__entry = Entry()
                 _append()
@@ -214,16 +260,15 @@ class _HTMLParser(HTMLParser):
                             urljoin(self.__base_url, self.__current_attrs["href"])
                         )
                     )
-                elif self.__stack[-1][0] == "div":
-                    if self.__stack[-1][1].startswith("thumb_map") or self.__stack[-1][1] == "AttachmentsList":
-                        self.__entry.content.append(LineBreak())
-                        self.__entry.content.append(
-                            Link.from_(
-                                self.__base_url,
-                                self.__current_attrs,
-                                urljoin(self.__base_url, self.__current_attrs["href"])
-                            )
+                elif self.__divs[-1].startswith("thumb_map") or self.__divs[-1] == "AttachmentsList":
+                    self.__entry.content.append(LineBreak())
+                    self.__entry.content.append(
+                        Link.from_(
+                            self.__base_url,
+                            self.__current_attrs,
+                            urljoin(self.__base_url, self.__current_attrs["href"])
                         )
+                    )
             elif tag == "br":
                 self.__entry.content.append(LineBreak())
             elif tag == "img" and self.__current_attrs.get("class") in {"PhotoPrimaryAttachment__imageElement", "MediaGrid__imageElement"}:
@@ -247,30 +292,24 @@ class _HTMLParser(HTMLParser):
             _append()
 
     def handle_endtag(self, tag):
-        if self.__stack:
-            while self.__stack:
-                stack_tag, _ = self.__stack.pop()
-                if stack_tag == tag:
-                    break
-            if not self.__stack:
+        if tag == "div" and self.__divs:
+            self.__divs.pop()
+            if not self.__divs:
                 self.__feed.entries.append(self.__entry)
         self.__current_tag = None
         self.__current_attrs = {}
 
     def handle_data(self, data):
-        if self.__current_tag == "title" and not self.__feed.is_title_filled():
+        if self.__current_tag == "title" and not self.__feed.is_title_set():
             self.__feed.set_title(data)
-        if self.__stack and data.strip():
-            if self.__stack[-1:] == [("div", "pi_text")] or self.__stack[-2:] == [("div", "pi_text"), ("span", "")]:
-                self._on_text(data)
-            elif self.__stack[-1:] == [("img", "emoji")] or self.__stack[-2:] ==[("img", "emoji"), ("span", "")]:
-                self._on_text(data)
-            elif self.__stack[-1][0] in {"a", "img"}:
-                if self.__stack[-2:-1] == [("div", "pi_text")] or self.__stack[-3:-1] == [("div", "pi_text"), ("span", "")]:
-                    title = self.__current_attrs.get("title", "")
+        if self.__divs and data.strip():
+            if self.__divs[-1] == "pi_text":
+                if self.__current_tag == "a":
                     self.__entry.content.append(Link.from_(self.__base_url, self.__current_attrs, data))
+                elif not (self.__current_tag == "span" and self.__current_attrs.get("class") == "PostTextMore__content"):
+                    self._on_text(data)
             if self.__debug_print:
-                print(f"{'|'.join(pair[0] + '.' + pair[1] for pair in self.__stack)} {data=}")
+                print(f"{'|'.join(div for div in self.__divs)} <{self.__current_tag}> {self.__current_attrs} {data=}")
 
     def feed(self, *args, **kwargs):
         super().feed(*args, **kwargs)
@@ -305,23 +344,9 @@ class _HTTPRequestHandler(BaseHTTPRequestHandler):
         name = query.get("name", [None])[0]
         if name:
             base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
-            feed = _HTMLParser(base_url).feed(_get_html(base_url))
+            xml_tree = _HTMLParser(base_url).feed(_get_html(base_url)).build_xml_tree(self.path)
+            return lambda: self._return_xml(xml_tree)
 
-            xml_tree = ET.Element("feed")
-            xml_tree.attrib.update(xmlns="http://www.w3.org/2005/Atom")
-            ET.SubElement(xml_tree, "title").text = feed.get_title()
-            ET.SubElement(xml_tree, "id").text = feed.url
-            ET.SubElement(xml_tree, "link").attrib.update(rel="self", href=self.path)
-            ET.SubElement(xml_tree, "updated").text = feed.get_updated().isoformat()
-            generator = ET.SubElement(xml_tree, "generator")
-            generator.attrib.update(
-                uri="https://got.any-key.press/?action=summary&path=avk.git",
-                version=_AVK_VERSION
-            )
-            generator.text = "Anonymous Atom feed generator (by VK)"
-
-            return lambda: self._return_xml(ET.ElementTree(xml_tree))
-
         return self._return_not_found
 
     def _return_xml(self, xml_tree):
@@ -352,13 +377,14 @@ def _run_http_server(address: str, port: int):
 
 
 def _run_dump(name: str):
-    base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
     print("=" * 80)
+    base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
     print(base_url)
-    feed = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url))
-    print("=" * 80)
-    print("\n".join(str(entry) for entry in feed.entries))
-    print(base_url)
+    print("-" * 80)
+    xml_tree = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url)).build_xml_tree("/?name=")
+    print("-" * 80)
+    ET.indent(xml_tree)
+    print(ET.tostring(xml_tree.getroot(), encoding="unicode"))
 
 
 if __name__ == "__main__":