commit a7cabe9585c64f4697ea89eacaa92a1a417655da
from: Aleksey Ryndin <continue@to.any-key.press>
date: Wed Sep 04 11:04:15 2024 UTC

Fix: render HTML to gmi

commit - 4aa3bfc3170cfbf70255585b2420a734c736fa35
commit + a7cabe9585c64f4697ea89eacaa92a1a417655da
blob - 4ba6f93c62fb3cf70232fbc0b215de58b8418205
blob + e36edd1f1880ae160310438f05d2eee1fbe51932
--- lonk.py
+++ lonk.py
@@ -1,7 +1,8 @@
 from json import loads as json_loads
 from pathlib import Path
-from urllib.parse import urlencode, urlunsplit
+from urllib.parse import urlencode, urlunsplit, urljoin
 from urllib.request import urlopen
+from html.parser import HTMLParser
 
 
 _TOKEN = (Path(__file__).parent / ".local" / "token").read_text(encoding="utf8")
@@ -36,11 +37,129 @@ for honk in reversed(home["honks"]):
         collected[convoy] = honk
 
 
+class _BaseTag:  # Base for tag handlers: keeps the tag name, subclasses supply on_data/flush.
+    def __init__(self, tag, attrs):  # attrs is accepted for subclasses; not stored here
+        self.tag = tag
+
+    def on_data(self, data):  # called with a text chunk; must be overridden
+        raise NotImplementedError()
+
+    def flush(self):  # must be overridden; the subclasses here return text and reset state
+        raise NotImplementedError()
+
+
+class ParagraphTag(_BaseTag):  # Collects text and flushes it as one whitespace-normalized line.
+    def __init__(self, tag, attrs):
+        super().__init__(tag, attrs)
+        self.content = []  # raw text chunks received since the last flush
+
+    def on_data(self, data):
+        self.content.append(data)  # keep edge whitespace: chunks are glued in flush()
+
+    def flush(self):
+        rv = " ".join("".join(self.content).split())  # glue first so "a<b>b</b>" stays "ab"
+        self.content = []
+        return rv + "\n" if rv else ""
+
+
+class LinkTag(_BaseTag):  # Collects the text of an <a> element and flushes a gemtext link line.
+    def __init__(self, tag, attrs):
+        super().__init__(tag, attrs)
+        self.href = None  # link target; stays None when the tag carries no href
+        self.content = []  # raw text chunks received since the last flush
+        for attr_name, attr_value in attrs:
+            if attr_name.lower() == "href":
+                self.href = urljoin(_build_url(), attr_value)  # resolve against _build_url()
+
+    def on_data(self, data):
+        self.content.append(data)  # keep edge whitespace: chunks are glued in flush()
+
+    def flush(self):
+        """Return "=> href text", or the bare text when no href is set; then reset."""
+        text = " ".join("".join(self.content).split())
+        # Text without a target is kept as plain text instead of being dropped.
+        rv = f"=> {self.href} {text}".rstrip() if self.href else text
+        self.href = None
+        self.content = []
+        return rv
+
+
+class LitItemTag(ParagraphTag):  # Paragraph whose flushed line is prefixed with "* ".
+    def flush(self):
+        content = super().flush()  # "" when nothing was collected
+        return f"* {content}" if content else ""
+
+
+class QuoteTag(ParagraphTag):  # Paragraph whose flushed line is prefixed with "> ".
+    def flush(self):
+        content = super().flush()  # "" when nothing was collected
+        return f"> {content}" if content else ""
+
+
+class HtmlToGmi(HTMLParser):
+    """Convert an HTML fragment to gemtext; unsupported tags raise NotImplementedError."""
+    _INLINE_TAGS = frozenset("b strong i em mark small del ins sub sup u rb ruby rp rt".split())
+
+    def __init__(self):
+        super().__init__()
+        self.gmi_text = []  # finished gemtext fragments, in output order
+        self.stack = []  # open tag handlers; the last one receives incoming text
+
+    def feed(self, html_text):
+        """Parse html_text and return the non-empty gemtext fragments joined by newlines."""
+        super().feed(html_text)
+        self.close()  # force out text HTMLParser may still buffer (e.g. a trailing "AT&T")
+        while self.stack:
+            self.gmi_text.append(self.stack.pop().flush())
+        return "\n".join(gmi_text for gmi_text in self.gmi_text if gmi_text)
+
+    def handle_starttag(self, tag, attrs):
+        """Flush pending text, then open a handler (li, blockquote, a) or emit an img link."""
+        if tag in self._INLINE_TAGS:
+            return  # inline formatting has no gemtext form and must not split the text
+        if self.stack:
+            self.gmi_text.append(self.stack[-1].flush())
+        if tag in {"p", "br", "ul"}:
+            pass  # the flush above already ended the current line
+        elif tag == "li":
+            self.stack.append(LitItemTag(tag, attrs))
+        elif tag == "blockquote":
+            self.stack.append(QuoteTag(tag, attrs))
+        elif tag == "a":
+            self.stack.append(LinkTag(tag, attrs))
+        elif tag == "img":
+            title = ""
+            src = ""
+            for attr_name, attr_value in attrs:
+                if attr_name.lower() == "title":
+                    title = attr_value or ""  # a valueless attribute arrives as None
+                elif attr_name.lower() == "src":
+                    src = urljoin(_build_url(), attr_value)
+            if src:
+                self.gmi_text.append(f"=> {src} {title}")
+        else:
+            raise NotImplementedError(tag, attrs)
+
+    def handle_data(self, data):
+        """Route text to the innermost open tag, opening a bare paragraph if there is none."""
+        if not self.stack:
+            self.stack.append(ParagraphTag("tag", []))
+        self.stack[-1].on_data(data)
+
+    def handle_endtag(self, tag):
+        """Flush the innermost open tag and pop it if tag closes it; inline tags are skipped."""
+        if tag in self._INLINE_TAGS or not self.stack:
+            return
+        self.gmi_text.append(self.stack[-1].flush())
+        if tag == self.stack[-1].tag:
+            self.stack.pop()
+
+
 def _format_honk(honk):
     assert honk["Format"] == "html", honk
     assert honk["Noise"] == honk["HTML"], honk
     firts_line = f'## From: {honk.get("Oondle") or honk["Handle"]}, {honk["Date"]}'
-    lines = [firts_line, f'=> {honk["XID"]}', honk["Noise"]]
+    lines = [firts_line, f'=> {honk["XID"]}', HtmlToGmi().feed(honk["Noise"])]
     for donk in (honk.get("Donks") or []):
         lines.append(f'=> {donk["URL"]}')