Commit Diff


commit - 5d0d11f445413917dffb40a2a259af37d9ccf8e5
commit + 38a989cbfcbadccc35a22dd44b2d34fe7d253a40
blob - 2d673a073775178ed19a8a9df063e0a016db7f0c
blob + d88ba135a02b04f994d67703ccb4f194096ffa9d
--- avk.py
+++ avk.py
@@ -120,6 +120,9 @@ def _parse_date(value: str):
 class Text:
     data: str
 
+    def build_xhtml(self, div):  # append this text to div as a <p> element
+        ET.SubElement(div, "p").text = self.data
+
 @dataclass(kw_only=True, frozen=True)
 class Link:
     data: str
@@ -137,18 +140,27 @@ class Link:
 
         return cls(data=data, url=href)
 
+    def build_xhtml(self, div):  # append <a href=url>data</a> to div
+        anchor = ET.SubElement(div, "a")
+        anchor.set("href", self.url)
+        anchor.text = self.data
+
 @dataclass(kw_only=True, frozen=True)
 class LineBreak:
-    pass
+    def build_xhtml(self, div):  # append an empty <br> element to div
+        ET.SubElement(div, "br")
 
 @dataclass(kw_only=True, frozen=True)
 class HorizontalRule:
-    pass
+    def build_xhtml(self, div):  # append an empty <hr> element to div
+        ET.SubElement(div, "hr")
 
 @dataclass(kw_only=True, frozen=True)
 class Image:
     url: str
 
+    def build_xhtml(self, div):  # append <img src=url> to div
+        ET.SubElement(div, "img", src=self.url)
 
 @dataclass(kw_only=True)
 class Entry:
@@ -156,24 +168,57 @@ class Entry:
     url: str = ""
     content: list[Text | Link | LineBreak | HorizontalRule | Image] = field(default_factory=list)
 
+    def build_xml(self, xml_feed):  # append this entry to xml_feed as an Atom <entry>
+        xml_entry = ET.SubElement(xml_feed, "entry")
+        ET.SubElement(xml_entry, "id").text = self.url
+        ET.SubElement(xml_entry, "updated").text = self.updated.isoformat()
 
+        raw_text = " ".join(i.data if hasattr(i, "data") else "\n" for i in self.content)  # items without data act as line separators
+        lines = [line for line in raw_text.splitlines() if line.strip()]  # a leading break/image must not yield an empty title
+        title = lines[0] if lines else raw_text
+        pos = title.find(". ")  # cut the title at the end of the first sentence
+        if pos > 0:
+            title = title[:pos]
+        if "!" in title:
+            title = title.split("!")[0] + "!"
+        elif "?" in title:
+            title = title.split("?")[0] + "?"
+        ET.SubElement(xml_entry, "title").text = title.strip() or self.url  # fall back to the URL when there is no text
+
+        ET.SubElement(xml_entry, "link").attrib.update(rel="alternate", href=self.url)
+        content = ET.SubElement(xml_entry, "content")
+        content.attrib.update(type="xhtml")
+        div = ET.SubElement(content, "div")  # every content item renders itself into this XHTML div
+        div.attrib.update(xmlns="http://www.w3.org/1999/xhtml")
+        for i in self.content:
+            i.build_xhtml(div)
+
+
 @dataclass(kw_only=True)
 class Feed:
     url: str
     _title: str  = ""
     entries: list[Entry] = field(default_factory=list)
 
-    def get_title(self) -> str:
-        return self._title or self.url
+    def is_title_set(self) -> bool:  # True once a non-empty title has been stored
+        return self._title != ""
 
-    def is_title_filled(self) -> bool:
-        return self._title
-
     def set_title(self, title: str):
         self._title = title
 
-    def get_updated(self) -> datetime:
-        return max(entry.updated for entry in self.entries)
+    def build_xml_tree(self, self_href):  # build the Atom document; self_href is the feed's own address
+        xml_feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
+        ET.SubElement(xml_feed, "title").text = self._title or self.url  # the URL doubles as title when none was parsed
+        ET.SubElement(xml_feed, "id").text = self.url
+        ET.SubElement(xml_feed, "link", rel="self", href=self_href)  # emitted by the old do_GET code; self_href was otherwise unused
+        updated = max(entry.updated for entry in self.entries) if self.entries else _get_now()  # max() of no entries would raise
+        ET.SubElement(xml_feed, "updated").text = updated.isoformat()
+        generator = ET.SubElement(xml_feed, "generator")
+        generator.attrib.update(uri="https://got.any-key.press/?action=summary&path=avk.git", version=_AVK_VERSION)
+        generator.text = "Anonymous Atom feed generator (by VK)"
+        for entry in self.entries:
+            entry.build_xml(xml_feed)
+        return ET.ElementTree(xml_feed)
 
 
 class _HTMLParser(HTMLParser):
@@ -181,7 +226,7 @@ class _HTMLParser(HTMLParser):
         super().__init__(*args, **kwargs)
         self.__base_url = base_url
         self.__debug_print = debug_print
-        self.__stack = []
+        self.__divs = []
         self.__feed = Feed(url=base_url)
         self.__entry = Entry()
         self.__current_tag = None
@@ -192,11 +237,12 @@ class _HTMLParser(HTMLParser):
         self.__current_attrs = {pair[0]: pair[1] for pair in attrs}
 
         def _append():
-            self.__stack.append((tag, self.__current_attrs.get("class", "").strip()))
-            if self.__debug_print:
-                print(f"> {'.'.join(pair[0] for pair in self.__stack)} {attrs=}")
+            if tag == "div":
+                self.__divs.append(self.__current_attrs.get("class", "").strip())
+                if self.__debug_print:
+                    print(f"> {'|'.join(div for div in self.__divs)} {attrs=}")
 
-        if not self.__stack:
+        if not self.__divs:
             if tag == "div" and self.__current_attrs.get("class", "").startswith("wall_item "):
                 self.__entry = Entry()
                 _append()
@@ -214,16 +260,15 @@ class _HTMLParser(HTMLParser):
                             urljoin(self.__base_url, self.__current_attrs["href"])
                         )
                     )
-                elif self.__stack[-1][0] == "div":
-                    if self.__stack[-1][1].startswith("thumb_map") or self.__stack[-1][1] == "AttachmentsList":
-                        self.__entry.content.append(LineBreak())
-                        self.__entry.content.append(
-                            Link.from_(
-                                self.__base_url,
-                                self.__current_attrs,
-                                urljoin(self.__base_url, self.__current_attrs["href"])
-                            )
+                elif self.__divs[-1].startswith("thumb_map") or self.__divs[-1] == "AttachmentsList":
+                    self.__entry.content.append(LineBreak())
+                    self.__entry.content.append(
+                        Link.from_(
+                            self.__base_url,
+                            self.__current_attrs,
+                            urljoin(self.__base_url, self.__current_attrs["href"])
                         )
+                    )
             elif tag == "br":
                 self.__entry.content.append(LineBreak())
             elif tag == "img" and self.__current_attrs.get("class") in {"PhotoPrimaryAttachment__imageElement", "MediaGrid__imageElement"}:
@@ -247,30 +292,24 @@ class _HTMLParser(HTMLParser):
             _append()
 
     def handle_endtag(self, tag):
-        if self.__stack:
-            while self.__stack:
-                stack_tag, _ = self.__stack.pop() 
-                if stack_tag == tag:
-                    break
-            if not self.__stack:
+        if tag == "div" and self.__divs:  # only <div> nesting is tracked
+            self.__divs.pop()
+            if not self.__divs:  # stack emptied: store the collected entry
                 self.__feed.entries.append(self.__entry)
         self.__current_tag = None
         self.__current_attrs = {}
 
     def handle_data(self, data):
-        if self.__current_tag == "title" and not self.__feed.is_title_filled():
+        if self.__current_tag == "title" and not self.__feed.is_title_set():  # keep only the first <title> text
             self.__feed.set_title(data)
-        if self.__stack and data.strip():
-            if self.__stack[-1:] == [("div", "pi_text")] or self.__stack[-2:] == [("div", "pi_text"), ("span", "")]:
-                self._on_text(data)
-            elif self.__stack[-1:] == [("img", "emoji")] or self.__stack[-2:] ==[("img", "emoji"), ("span", "")]:
-                self._on_text(data)
-            elif self.__stack[-1][0] in {"a", "img"}:
-                if self.__stack[-2:-1] == [("div", "pi_text")] or self.__stack[-3:-1] == [("div", "pi_text"), ("span", "")]:
-                    title = self.__current_attrs.get("title", "")
+        if self.__divs and data.strip():  # non-blank text inside a tracked div
+            if self.__divs[-1] == "pi_text":
+                if self.__current_tag == "a":
                     self.__entry.content.append(Link.from_(self.__base_url, self.__current_attrs, data))
+                elif not (self.__current_tag == "span" and self.__current_attrs.get("class") == "PostTextMore__content"):
+                    self._on_text(data)  # all other text, except text of a PostTextMore__content span
             if self.__debug_print:
-                print(f"{'|'.join(pair[0] + '.' + pair[1] for pair in self.__stack)} {data=}")
+                print(f"{'|'.join(div for div in self.__divs)} <{self.__current_tag}> {self.__current_attrs} {data=}")
 
     def feed(self, *args, **kwargs):
         super().feed(*args, **kwargs)
@@ -305,23 +344,9 @@ class _HTTPRequestHandler(BaseHTTPRequestHandler):
             name = query.get("name", [None])[0]
             if name:
                 base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
-                feed = _HTMLParser(base_url).feed(_get_html(base_url))
+                xml_tree = _HTMLParser(base_url).feed(_get_html(base_url)).build_xml_tree(self.path)
+                return lambda: self._return_xml(xml_tree)
 
-                xml_tree = ET.Element("feed")
-                xml_tree.attrib.update(xmlns="http://www.w3.org/2005/Atom")
-                ET.SubElement(xml_tree, "title").text = feed.get_title()
-                ET.SubElement(xml_tree, "id").text = feed.url
-                ET.SubElement(xml_tree, "link").attrib.update(rel="self", href=self.path)
-                ET.SubElement(xml_tree, "updated").text = feed.get_updated().isoformat()
-                generator = ET.SubElement(xml_tree, "generator")
-                generator.attrib.update(
-                    uri="https://got.any-key.press/?action=summary&path=avk.git",
-                    version=_AVK_VERSION
-                )
-                generator.text = "Anonymous Atom feed generator (by VK)"
-
-                return lambda: self._return_xml(ET.ElementTree(xml_tree))
-
         return self._return_not_found
 
     def _return_xml(self, xml_tree):
@@ -352,13 +377,14 @@ def _run_http_server(address: str, port: int):
 
 
 def _run_dump(name: str):
-    base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
     print("=" * 80)
+    base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))  # https://m.vk.com/<name>
     print(base_url)
-    feed = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url))
-    print("=" * 80)
-    print("\n".join(str(entry) for entry in feed.entries))
-    print(base_url)
+    print("-" * 80)
+    xml_tree = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url)).build_xml_tree("/?name=")  # parser debug output prints here
+    print("-" * 80)
+    ET.indent(xml_tree)  # re-indent the tree in place for readable output
+    print(ET.tostring(xml_tree.getroot(), encoding="unicode"))
 
 
 if __name__ == "__main__":