commit - 53cff30f688fa4c6de0b531106aa693125ebbfa3
commit + 3ac0752c060d3243f4fabbfb6627e2785a0ebc05
blob - 4a503df4bd8722ca47fe51f3a075e73d1f8e7d7a
blob + c9fff8bed243e98a938761c0d7bfa7e6515938ef
--- avk.py
+++ avk.py
from json import loads as json_loads
-def _get_html(base_url):
+def _get_html(base_url: str) -> str:
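+    # Fetch the page with a cookie-aware opener; the cookie jar is presumably
+    # needed for m.vk.com to serve the full page to a script.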
cookiejar = CookieJar()
opener = build_opener(HTTPCookieProcessor(cookiejar))
with opener.open(base_url, timeout=15) as response:
return response.read().decode("utf8")
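+# Russian month-name abbreviations as m.vk.com renders them, mapped to month numbers 1-12.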
+_MONTHS = {
+ name: i + 1
+ for i, name in enumerate(
+ ("янв", "фев", "мар", "апр", "мая", "июн", "июл", "авг", "сен", "окт", "ноя", "дек")
+ )
+}
+
+
+def _parse_date(value: str) -> datetime:
+    """Parse a post date such as "5 мая 2020" or "5 мая в 13:45"."""
+    parts = value.split()
+    if not parts[0].isdecimal():
+        # Dates that do not start with a day number (e.g. relative forms) are not handled.
+        raise NotImplementedError(value)
+    day = int(parts[0])
+    month = _MONTHS[parts[1]]
+    if len(parts) == 3:
+        # "<day> <month> <year>"
+        return datetime(year=int(parts[2]), month=month, day=day)
+    if len(parts) == 4 and parts[2] == "в":
+        # "<day> <month> в <HH:MM>": the year is omitted, so assume the post is
+        # less than a year old and pick the current or the previous year accordingly.
+        now = datetime.now()
+        year = now.year if (month, day) <= (now.month, now.day) else now.year - 1
+        hour, minute = [int(i) for i in parts[3].split(":")]
+        return datetime(year=year, month=month, day=day, hour=hour, minute=minute)
+    raise NotImplementedError(value)
+
+
@dataclass()
class Text:
data: str
content: list = field(default_factory=list)
-class MyHTMLParser(HTMLParser):
+class _HTMLParser(HTMLParser):
def __init__(self, base_url, *args, **kwargs):
super().__init__(*args, **kwargs)
self.__base_url = base_url
if tag == "a":
if self.__current_attrs.get("class", "").startswith("PostHeaderTime"):
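+                # PostHeaderTime anchors carry the post timestamp in data-date
+                # and the post permalink in href.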
assert not self.__entry.updated, self.__entry
- self.__entry.updated = self.__current_attrs["data-date"]
+ self.__entry.updated = _parse_date(self.__current_attrs["data-date"])
assert not self.__entry.url, self.__entry
self.__entry.url = urljoin(self.__base_url, self.__current_attrs["href"])
elif self.__current_attrs.get("class", "") == "MediaGrid__interactive":
"https://m.vk.com/doodletimeru",
"https://m.vk.com/shantynatty",
):
- harvested = MyHTMLParser(base_url).feed(_get_html(base_url))
- print("*" * 80)
print(base_url)
- print("\n".join(str(entry) for entry in harvested))
- print(base_url)
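+        # feed() is assumed to return the collected entries here; the stdlib
+        # HTMLParser.feed() itself returns None, so _HTMLParser must override it.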
+ harvested = _HTMLParser(base_url).feed(_get_html(base_url))
+ print("\n".join(str(entry.updated) for entry in harvested))