commit - 5d0d11f445413917dffb40a2a259af37d9ccf8e5
commit + 38a989cbfcbadccc35a22dd44b2d34fe7d253a40
blob - 2d673a073775178ed19a8a9df063e0a016db7f0c
blob + d88ba135a02b04f994d67703ccb4f194096ffa9d
--- avk.py
+++ avk.py
class Text:
data: str
+ def build_xhtml(self, div):
+ # Append a <p> child to ``div`` whose text is this item's ``data``.
+ ET.SubElement(div, "p").text = self.data
+
@dataclass(kw_only=True, frozen=True)
class Link:
data: str
return cls(data=data, url=href)
+ def build_xhtml(self, div):
+ # Append an <a> child to ``div``: ``href`` is taken from ``self.url``
+ # and the element text from ``self.data``.
+ a = ET.SubElement(div, "a")
+ a.attrib.update(href=self.url)
+ a.text = self.data
+
@dataclass(kw_only=True, frozen=True)
class LineBreak:
- pass
+ def build_xhtml(self, div):
+ # Append an empty <br> child to ``div``.
+ ET.SubElement(div, "br")
@dataclass(kw_only=True, frozen=True)
class HorizontalRule:
- pass
+ def build_xhtml(self, div):
+ # Append an empty <hr> child to ``div``.
+ ET.SubElement(div, "hr")
@dataclass(kw_only=True, frozen=True)
class Image:
url: str
+ def build_xhtml(self, div):
+ # Append an <img> child to ``div`` with ``src`` set to ``self.url``.
+ ET.SubElement(div, "img").attrib.update(src=self.url)
@dataclass(kw_only=True)
class Entry:
url: str = ""
content: list[Text | Link | LineBreak | HorizontalRule | Image] = field(default_factory=list)
+ def build_xml(self, xml_feed):
+ # Append an <entry> element for this post to ``xml_feed`` with id,
+ # updated, title, alternate link and XHTML content children.
+ xml_entry = ET.SubElement(xml_feed, "entry")
+ ET.SubElement(xml_entry, "id").text = self.url
+ ET.SubElement(xml_entry, "updated").text = self.updated.isoformat()
+ # The title is derived from the text of the content items; items
+ # without a ``data`` attribute contribute a newline instead.
+ raw_text = " ".join(i.data if hasattr(i, "data") else "\n" for i in self.content)
+ lines = raw_text.splitlines()
+ title = lines[0] if lines else raw_text
+ # NOTE: ``pos`` is located in ``raw_text`` but used to slice ``title``.
+ pos = raw_text.find(". ")
+ if pos > 0:
+ title = title[:pos]
+ if "!" in title:
+ title = title.split("!")[0] + "!"
+ elif "?" in title:
+ title = title.split("?")[0] + "?"
+ # A title that is empty after stripping falls back to the entry URL.
+ ET.SubElement(xml_entry, "title").text = title.strip() or self.url
+
+ ET.SubElement(xml_entry, "link").attrib.update(rel="alternate", href=self.url)
+ # The content is an XHTML <div> that each content item fills in through
+ # its own build_xhtml().
+ content = ET.SubElement(xml_entry, "content")
+ content.attrib.update(type="xhtml")
+ div = ET.SubElement(content, "div")
+ div.attrib.update(xmlns="http://www.w3.org/1999/xhtml")
+ for i in self.content:
+ i.build_xhtml(div)
+
+
@dataclass(kw_only=True)
class Feed:
url: str
_title: str = ""
entries: list[Entry] = field(default_factory=list)
- def get_title(self) -> str:
- return self._title or self.url
+ def is_title_set(self) -> bool:
+ # True when ``_title`` is not the empty string.
+ return self._title != ""
- def is_title_filled(self) -> bool:
- return self._title
-
def set_title(self, title: str):
self._title = title
- def get_updated(self) -> datetime:
- return max(entry.updated for entry in self.entries)
+ def build_xml_tree(self, self_href):
+ # Build the Atom <feed> document for this feed and return it wrapped in
+ # an ElementTree. ``self_href`` is emitted as the <link rel="self">
+ # element; the request handler passes its own request path here.
+ xml_feed = ET.Element("feed")
+ xml_feed.attrib.update(xmlns="http://www.w3.org/2005/Atom")
+ # An unset title falls back to the feed URL.
+ ET.SubElement(xml_feed, "title").text = self._title or self.url
+ ET.SubElement(xml_feed, "id").text = self.url
+ # Keep the self link that the previous inline XML construction emitted;
+ # without this line the ``self_href`` argument would be silently ignored.
+ ET.SubElement(xml_feed, "link").attrib.update(rel="self", href=self_href)
+ # With no entries there is nothing to take max() over, so the current
+ # time is used instead.
+ updated = max(entry.updated for entry in self.entries) if self.entries else _get_now()
+ ET.SubElement(xml_feed, "updated").text = updated.isoformat()
+ generator = ET.SubElement(xml_feed, "generator")
+ generator.attrib.update(uri="https://got.any-key.press/?action=summary&path=avk.git", version=_AVK_VERSION)
+ generator.text = "Anonymous Atom feed generator (by VK)"
+ for entry in self.entries:
+ entry.build_xml(xml_feed)
+ return ET.ElementTree(xml_feed)
class _HTMLParser(HTMLParser):
super().__init__(*args, **kwargs)
self.__base_url = base_url
self.__debug_print = debug_print
- self.__stack = []
+ self.__divs = []
self.__feed = Feed(url=base_url)
self.__entry = Entry()
self.__current_tag = None
self.__current_attrs = {pair[0]: pair[1] for pair in attrs}
def _append():
- self.__stack.append((tag, self.__current_attrs.get("class", "").strip()))
- if self.__debug_print:
- print(f"> {'.'.join(pair[0] for pair in self.__stack)} {attrs=}")
+ if tag == "div":
+ self.__divs.append(self.__current_attrs.get("class", "").strip())
+ if self.__debug_print:
+ print(f"> {'|'.join(div for div in self.__divs)} {attrs=}")
- if not self.__stack:
+ if not self.__divs:
if tag == "div" and self.__current_attrs.get("class", "").startswith("wall_item "):
self.__entry = Entry()
_append()
urljoin(self.__base_url, self.__current_attrs["href"])
)
)
- elif self.__stack[-1][0] == "div":
- if self.__stack[-1][1].startswith("thumb_map") or self.__stack[-1][1] == "AttachmentsList":
- self.__entry.content.append(LineBreak())
- self.__entry.content.append(
- Link.from_(
- self.__base_url,
- self.__current_attrs,
- urljoin(self.__base_url, self.__current_attrs["href"])
- )
+ elif self.__divs[-1].startswith("thumb_map") or self.__divs[-1] == "AttachmentsList":
+ self.__entry.content.append(LineBreak())
+ self.__entry.content.append(
+ Link.from_(
+ self.__base_url,
+ self.__current_attrs,
+ urljoin(self.__base_url, self.__current_attrs["href"])
)
+ )
elif tag == "br":
self.__entry.content.append(LineBreak())
elif tag == "img" and self.__current_attrs.get("class") in {"PhotoPrimaryAttachment__imageElement", "MediaGrid__imageElement"}:
_append()
def handle_endtag(self, tag):
- if self.__stack:
- while self.__stack:
- stack_tag, _ = self.__stack.pop()
- if stack_tag == tag:
- break
- if not self.__stack:
+ # Only </div> is tracked: it drops the innermost recorded div class.
+ # When no tracked div is left, the current entry is appended to the feed.
+ if tag == "div" and self.__divs:
+ self.__divs.pop()
+ if not self.__divs:
self.__feed.entries.append(self.__entry)
self.__current_tag = None
self.__current_attrs = {}
def handle_data(self, data):
+ # Handle character data: the first <title> text becomes the feed title,
+ # and non-blank text seen while the innermost tracked div is "pi_text"
+ # is added to the current entry (as a Link inside <a>, as text otherwise).
- if self.__current_tag == "title" and not self.__feed.is_title_filled():
+ if self.__current_tag == "title" and not self.__feed.is_title_set():
self.__feed.set_title(data)
- if self.__stack and data.strip():
- if self.__stack[-1:] == [("div", "pi_text")] or self.__stack[-2:] == [("div", "pi_text"), ("span", "")]:
- self._on_text(data)
- elif self.__stack[-1:] == [("img", "emoji")] or self.__stack[-2:] ==[("img", "emoji"), ("span", "")]:
- self._on_text(data)
- elif self.__stack[-1][0] in {"a", "img"}:
- if self.__stack[-2:-1] == [("div", "pi_text")] or self.__stack[-3:-1] == [("div", "pi_text"), ("span", "")]:
- title = self.__current_attrs.get("title", "")
+ if self.__divs and data.strip():
+ if self.__divs[-1] == "pi_text":
+ if self.__current_tag == "a":
self.__entry.content.append(Link.from_(self.__base_url, self.__current_attrs, data))
+ # Text directly inside <span class="PostTextMore__content"> is skipped.
+ # The comparison must sit outside .get(): .get("class" == "...") looks
+ # up the key False and always yields None, so nothing was ever skipped.
+ elif not (self.__current_tag == "span" and self.__current_attrs.get("class") == "PostTextMore__content"):
+ self._on_text(data)
if self.__debug_print:
- print(f"{'|'.join(pair[0] + '.' + pair[1] for pair in self.__stack)} {data=}")
+ print(f"{'|'.join(div for div in self.__divs)} <{self.__current_tag}> {self.__current_attrs} {data=}")
def feed(self, *args, **kwargs):
super().feed(*args, **kwargs)
name = query.get("name", [None])[0]
if name:
base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
- feed = _HTMLParser(base_url).feed(_get_html(base_url))
+ xml_tree = _HTMLParser(base_url).feed(_get_html(base_url)).build_xml_tree(self.path)
+ return lambda: self._return_xml(xml_tree)
- xml_tree = ET.Element("feed")
- xml_tree.attrib.update(xmlns="http://www.w3.org/2005/Atom")
- ET.SubElement(xml_tree, "title").text = feed.get_title()
- ET.SubElement(xml_tree, "id").text = feed.url
- ET.SubElement(xml_tree, "link").attrib.update(rel="self", href=self.path)
- ET.SubElement(xml_tree, "updated").text = feed.get_updated().isoformat()
- generator = ET.SubElement(xml_tree, "generator")
- generator.attrib.update(
- uri="https://got.any-key.press/?action=summary&path=avk.git",
- version=_AVK_VERSION
- )
- generator.text = "Anonymous Atom feed generator (by VK)"
-
- return lambda: self._return_xml(ET.ElementTree(xml_tree))
-
return self._return_not_found
def _return_xml(self, xml_tree):
def _run_dump(name: str):
+ # Debug helper: fetch the m.vk.com page for ``name``, parse it with debug
+ # printing enabled, and print the resulting feed XML indented.
- base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
print("=" * 80)
+ base_url = urlunparse(("https", "m.vk.com", name, "", "", ""))
print(base_url)
- feed = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url))
- print("=" * 80)
- print("\n".join(str(entry) for entry in feed.entries))
- print(base_url)
+ print("-" * 80)
+ # NOTE(review): the self href passed here is "/?name=" without ``name``
+ # appended — confirm this is intended.
+ xml_tree = _HTMLParser(base_url, debug_print=True).feed(_get_html(base_url)).build_xml_tree("/?name=")
+ print("-" * 80)
+ ET.indent(xml_tree)
+ print(ET.tostring(xml_tree.getroot(), encoding="unicode"))
if __name__ == "__main__":