# Reconstructed from the diff of lonk.py in commit
# a7cabe9585c64f4697ea89eacaa92a1a417655da ("Fix: render HTML to gmi",
# Aleksey Ryndin, Wed Sep 04 11:04:15 2024 UTC; parent
# 4aa3bfc3170cfbf70255585b2420a734c736fa35).
#
# Only definitions that the diff shows in full are reproduced here.  The same
# commit also changes _format_honk() so that honk["Noise"] is rendered through
# HtmlToGmi().feed(honk["Noise"]) instead of being emitted as raw HTML; that
# function is only partially visible in the diff and is not reproduced.
from html.parser import HTMLParser
from json import loads as json_loads
from pathlib import Path
from urllib.parse import urlencode, urljoin, urlunsplit
from urllib.request import urlopen

# Tags that only terminate the line currently being collected.
_BLOCK_BREAK_TAGS = frozenset({
    "p", "br", "ul", "ol", "div", "hr", "pre",
    "h1", "h2", "h3", "h4", "h5", "h6",
})

# Every tag the converter reacts to.  Anything else (b, strong, i, em, span,
# code, ...) is treated as transparent: the tag is dropped, its text is kept
# in the surrounding element.  Start and end tags must agree on this,
# otherwise an inline end tag would split the paragraph it sits in.
_STRUCTURAL_TAGS = _BLOCK_BREAK_TAGS | {"li", "blockquote", "a", "img"}


def _squash(chunks):
    """Join text chunks with single spaces, collapsing whitespace runs."""
    return " ".join(" ".join(chunk.split()) for chunk in chunks if chunk)


class _BaseTag:
    """Collector for one open HTML element kept on the HtmlToGmi stack.

    Subclasses accumulate text via on_data() and render (and reset) it via
    flush().
    """

    def __init__(self, tag, attrs):
        self.tag = tag

    def on_data(self, data):
        raise NotImplementedError()

    def flush(self):
        raise NotImplementedError()


class ParagraphTag(_BaseTag):
    """Plain text block; renders as one text line followed by a newline."""

    def __init__(self, tag, attrs):
        super().__init__(tag, attrs)
        self.content = []

    def on_data(self, data):
        self.content.append(data.strip())

    def flush(self):
        """Return the collected text plus a newline ("" if empty) and reset."""
        text = _squash(self.content)
        self.content = []
        return text + "\n" if text else ""


class LinkTag(_BaseTag):
    """<a> element; renders as a Gemtext link line ("=> URL TEXT").

    base_url is the URL relative hrefs are resolved against.  When it is
    None the module-level _build_url() (defined elsewhere in lonk.py) is
    called, which is what the original code always did.
    """

    def __init__(self, tag, attrs, base_url=None):
        super().__init__(tag, attrs)
        self.href = None
        self.content = []
        for attr_name, attr_value in attrs:
            if attr_name.lower() == "href":
                base = _build_url() if base_url is None else base_url
                # A valueless attribute (<a href>) arrives as None.
                self.href = urljoin(base, attr_value or "")

    def on_data(self, data):
        self.content.append(data.strip())

    def flush(self):
        """Return the link line and reset.

        The href is emitted only once.  Text collected without an href
        (no href attribute, or href already emitted by an earlier flush)
        is returned as plain text instead of being discarded.
        """
        text = _squash(self.content)
        href = self.href
        self.href = None
        self.content = []
        if href:
            return f"=> {href} {text}".rstrip()
        return text


class LitItemTag(ParagraphTag):
    """<li> element; renders as a Gemtext list item ("* TEXT")."""

    def flush(self):
        content = super().flush()
        return f"* {content}" if content else ""


class QuoteTag(ParagraphTag):
    """<blockquote> element; renders as a Gemtext quote line ("> TEXT")."""

    def flush(self):
        content = super().flush()
        return f"> {content}" if content else ""


class HtmlToGmi(HTMLParser):
    """Convert an HTML fragment to Gemtext.

    Usage: HtmlToGmi().feed(html) returns the Gemtext string.  Rendered
    lines accumulate in gmi_text across feed() calls, so use a fresh
    instance per document.

    base_url is the URL that relative link and image targets are resolved
    against; None means "ask _build_url()" (defined elsewhere in lonk.py).
    """

    def __init__(self, base_url=None):
        super().__init__()
        self.base_url = base_url
        self.gmi_text = []
        self.stack = []

    def feed(self, html_text):
        """Parse html_text and return everything rendered so far as Gemtext."""
        super().feed(html_text)
        # Without close() the parser keeps trailing text that ends in an
        # unterminated "&" (e.g. "AT&T") buffered, and it would be lost.
        self.close()
        while self.stack:
            self.gmi_text.append(self.stack.pop().flush())
        return "\n".join(line for line in self.gmi_text if line)

    def _flush_top(self):
        """Render whatever the innermost open element has collected."""
        if self.stack:
            self.gmi_text.append(self.stack[-1].flush())

    def _resolve(self, url):
        """Make url absolute against base_url (or _build_url())."""
        base = _build_url() if self.base_url is None else self.base_url
        return urljoin(base, url)

    def handle_starttag(self, tag, attrs):
        if tag not in _STRUCTURAL_TAGS:
            # Inline formatting or unknown tag: keep the text, drop the tag.
            return

        self._flush_top()

        if tag == "li":
            self.stack.append(LitItemTag(tag, attrs))
        elif tag == "blockquote":
            self.stack.append(QuoteTag(tag, attrs))
        elif tag == "a":
            self.stack.append(LinkTag(tag, attrs, base_url=self.base_url))
        elif tag == "img":
            found = {}
            for attr_name, attr_value in attrs:
                found[attr_name.lower()] = attr_value or ""
            if found.get("src"):
                label = found.get("title") or found.get("alt") or ""
                link = f"=> {self._resolve(found['src'])} {label}"
                self.gmi_text.append(link.rstrip())
        # The remaining structural tags only break the line (flushed above).

    def handle_data(self, data):
        if not self.stack:
            # Text outside any tracked element goes to an implicit paragraph.
            self.stack.append(ParagraphTag("tag", []))
        self.stack[-1].on_data(data)

    def handle_endtag(self, tag):
        if tag not in _STRUCTURAL_TAGS or not self.stack:
            return
        top = self.stack[-1]
        self.gmi_text.append(top.flush())
        if top.tag == tag:
            self.stack.pop()