From eb0859fbaf66782f18419a016cedcca905fbbccd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 18 Jan 2023 21:34:40 +0000
Subject: [PATCH] vk archiver

---
 src/archivers/__init__.py      |  5 ++-
 src/archivers/vk_archiverv2.py | 65 +++++++++++++++++++++++++++++++++
 src/utils/misc.py              | 11 ++++++
 3 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 src/archivers/vk_archiverv2.py

diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py
index 8fb0265..d2a2c49 100644
--- a/src/archivers/__init__.py
+++ b/src/archivers/__init__.py
@@ -7,7 +7,7 @@ from .archiver import Archiverv2
 from .wayback_archiver import WaybackArchiver
 from .youtubedl_archiver import YoutubeDLArchiver
 # from .twitter_archiver import TwitterArchiver
-from .vk_archiver import VkArchiver
+# from .vk_archiver import VkArchiver
 # from .twitter_api_archiver import TwitterApiArchiver
 # from .instagram_archiver import InstagramArchiver
 
@@ -16,4 +16,5 @@ from .twitter_archiverv2 import TwitterArchiver
 from .twitter_api_archiverv2 import TwitterApiArchiver
 from .instagram_archiverv2 import InstagramArchiver
 from .tiktok_archiverv2 import TiktokArchiver
-from .telegram_archiverv2 import TelegramArchiver
\ No newline at end of file
+from .telegram_archiverv2 import TelegramArchiver
+from .vk_archiverv2 import VkArchiver
\ No newline at end of file
diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py
new file mode 100644
--- /dev/null
+++ b/src/archivers/vk_archiverv2.py
@@ -0,0 +1,65 @@
+import re, json, mimetypes, os
+
+from loguru import logger
+from vk_url_scraper import VkScraper, DateTimeEncoder
+
+from metadata import Metadata
+from media import Media
+from utils.misc import dump_payload
+from .archiver import Archiverv2
+
+
+class VkArchiver(Archiverv2):
+    """
+    VK videos are handled by YTDownloader, this archiver gets posts text and images.
+    Currently only works for /wall posts
+    """
+    name = "vk_archiver"
+    wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
+    photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
+
+    def __init__(self, config: dict) -> None:
+        """Check username/password with assert_valid_string and create the VkScraper."""
+        super().__init__(config)
+        self.assert_valid_string("username")
+        self.assert_valid_string("password")
+        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
+
+    @staticmethod
+    def configs() -> dict:
+        """Return this archiver's config options, each with its default and help text."""
+        return {
+            "username": {"default": None, "help": "valid VKontakte username"},
+            "password": {"default": None, "help": "valid VKontakte password"},
+            "session_file": {"default": "secrets/vk_config.v2.json", "help": "path of the session file given to VkScraper"},
+        }
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        Scrape the url of `item` with VkScraper and return a new Metadata holding the
+        title, timestamp, dumped payload and downloaded media, marked with success("vk").
+        Returns False when item.netloc does not contain "vk.com" or nothing was scraped.
+        """
+        url = item.get_url()
+
+        if "vk.com" not in item.netloc: return False
+
+        # some urls can contain multiple wall/photo/... parts and all will be fetched
+        vk_scrapes = self.vks.scrape(url)
+        if not vk_scrapes: return False
+
+        result = Metadata()
+        # title and timestamp are only set while the result does not have one yet
+        for scrape in vk_scrapes:
+            if not result.get_title():
+                result.set_title(scrape["text"])
+            if not result.get_timestamp():
+                result.set_timestamp(scrape["datetime"])
+
+        result.set_content(dump_payload(vk_scrapes))
+
+        filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
+        for filename in filenames:
+            result.add_media(Media(filename))
+
+        return result.success("vk")
diff --git a/src/utils/misc.py b/src/utils/misc.py
index 644c713..e7c5427 100644
--- a/src/utils/misc.py
+++ b/src/utils/misc.py
@@ -29,3 +29,14 @@ def getattr_or(o: object, prop: str, default=None):
     except:
         return default
 
+
+class DateTimeEncoder(json.JSONEncoder):
+    # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
+    def default(self, o):
+        if isinstance(o, datetime):
+            return str(o)  # with timezone
+        return json.JSONEncoder.default(self, o)
+
+
+def dump_payload(p):
+    return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)