# Three-letter Russian month abbreviations mapped to their month numbers (1-12).
_MONTHS = {
    name: number
    for number, name in enumerate(
        ("янв", "фев", "мар", "апр", "мая", "июн", "июл", "авг", "сен", "окт", "ноя", "дек"),
        start=1,
    )
}


def _parse_date(value: str, *, now: "datetime | None" = None) -> datetime:
    """Convert a short Russian date string into a naive ``datetime``.

    Two whitespace-separated layouts are understood:

    * ``"<day> <month> <year>"`` -- e.g. ``"16 янв 2025"``; the time part of
      the result is midnight.
    * ``"<day> <month> в <hour>:<minute>"`` -- e.g. ``"16 янв в 8:00"``; the
      string carries no year, so it is inferred: the current year when the
      month is not later than the current month, otherwise the previous year.

    ``<month>`` must be one of the keys of ``_MONTHS``.

    Args:
        value: The date string to parse.
        now: Reference moment for the year inference. Defaults to
            ``datetime.now()``; pass an explicit value for deterministic
            results.

    Returns:
        The parsed naive ``datetime``.

    Raises:
        NotImplementedError: If ``value`` matches neither supported layout
            (including empty input, a non-numeric day or an unknown month).
        ValueError: If a numeric field cannot be converted or the resulting
            date/time is out of range.
    """
    parts = value.split()
    # Every supported layout starts with "<day> <month>"; reject everything
    # else up front with one consistent exception instead of letting
    # IndexError/KeyError escape from the lookups below.
    if len(parts) < 2 or not parts[0].isdecimal() or parts[1] not in _MONTHS:
        raise NotImplementedError(f"unsupported date format: {value!r}")

    day = int(parts[0])
    month = _MONTHS[parts[1]]

    if len(parts) == 3:
        return datetime(year=int(parts[2]), month=month, day=day)

    # "в" is the Russian preposition separating the date from the time.
    if len(parts) == 4 and parts[2] == "в":
        if now is None:
            now = datetime.now()
        # A month later than the current one is taken to belong to last year.
        year = now.year if month <= now.month else now.year - 1
        hour, minute = map(int, parts[3].split(":"))
        return datetime(year=year, month=month, day=day, hour=hour, minute=minute)

    raise NotImplementedError(f"unsupported date format: {value!r}")
self.__entry.updated = _parse_date(self.__current_attrs["data-date"]) assert not self.__entry.url, self.__entry self.__entry.url = urljoin(self.__base_url, self.__current_attrs["href"]) elif self.__current_attrs.get("class", "") == "MediaGrid__interactive": @@ -184,8 +212,9 @@ for base_url in ( "https://m.vk.com/doodletimeru", "https://m.vk.com/shantynatty", ): - harvested = MyHTMLParser(base_url).feed(_get_html(base_url)) - print("*" * 80) + # print("*" * 80) print(base_url) - print("\n".join(str(entry) for entry in harvested)) - print(base_url) + harvested = _HTMLParser(base_url).feed(_get_html(base_url)) + print("\n".join(str(entry.updated) for entry in harvested)) + # print("\n".join(str(entry) for entry in harvested)) + # print(base_url)