From eb0859fbaf66782f18419a016cedcca905fbbccd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 18 Jan 2023 21:34:40 +0000
Subject: [PATCH] vk archiver
---
src/archivers/__init__.py | 5 ++-
src/archivers/vk_archiverv2.py | 67 ++++++++++++++++++++++++++++++++++
src/utils/misc.py | 11 ++++++
3 files changed, 81 insertions(+), 2 deletions(-)
create mode 100644 src/archivers/vk_archiverv2.py
diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py
index 8fb0265..d2a2c49 100644
--- a/src/archivers/__init__.py
+++ b/src/archivers/__init__.py
@@ -7,7 +7,7 @@ from .archiver import Archiverv2
from .wayback_archiver import WaybackArchiver
from .youtubedl_archiver import YoutubeDLArchiver
# from .twitter_archiver import TwitterArchiver
-from .vk_archiver import VkArchiver
+# from .vk_archiver import VkArchiver
# from .twitter_api_archiver import TwitterApiArchiver
# from .instagram_archiver import InstagramArchiver
@@ -16,4 +16,5 @@ from .twitter_archiverv2 import TwitterArchiver
from .twitter_api_archiverv2 import TwitterApiArchiver
from .instagram_archiverv2 import InstagramArchiver
from .tiktok_archiverv2 import TiktokArchiver
-from .telegram_archiverv2 import TelegramArchiver
\ No newline at end of file
+from .telegram_archiverv2 import TelegramArchiver
+from .vk_archiverv2 import VkArchiver
\ No newline at end of file
diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py
new file mode 100644
index 0000000..147424d
--- /dev/null
+++ b/src/archivers/vk_archiverv2.py
@@ -0,0 +1,67 @@
+import re, json, mimetypes, os
+
+from loguru import logger
+from vk_url_scraper import VkScraper, DateTimeEncoder
+
+from metadata import Metadata
+from media import Media
+from utils.misc import dump_payload
+from .archiver import Archiverv2
+
+
+class VkArchiver(Archiverv2):
+ """
+ VK videos are handled by YTDownloader; this archiver gets post text and images.
+ Currently only works for /wall posts.
+ """
+ name = "vk_archiver"
+ wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
+ photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
+
+    def __init__(self, config: dict) -> None:  # check credentials, then build the scraper
+        super().__init__(config)
+        for required_key in ("username", "password"):  # both go through assert_valid_string first
+            self.assert_valid_string(required_key)
+        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
+
+    @staticmethod
+    def configs() -> dict:  # maps each option name to its default value and help text
+        return {
+            "username": {"default": None, "help": "valid VKontakte username"},
+            "password": {"default": None, "help": "valid VKontakte password"},
+            "session_file": {"default": "secrets/vk_config.v2.json", "help": "path to the session file passed to VkScraper"},
+        }
+
+ def download(self, item: Metadata) -> Metadata:
+ url = item.get_url()
+
+ if "vk.com" not in item.netloc: return False
+
+ # some urls can contain multiple wall/photo/... parts and all will be fetched
+ vk_scrapes = self.vks.scrape(url)
+ if not len(vk_scrapes): return False
+
+ result = Metadata()
+ for scrape in vk_scrapes:
+ if not result.get_title():
+ result.set_title(scrape["text"])
+ if not result.get_timestamp():
+ result.set_timestamp(scrape["datetime"])
+
+ result.set_content(dump_payload(vk_scrapes))
+
+ textual_output = ""
+ title, datetime = vk_scrapes[0]["text"], vk_scrapes[0]["datetime"]
+ urls_found = []
+ for scrape in vk_scrapes:
+ textual_output += f"id: {scrape['id']}
time utc: {scrape['datetime']}
text: {scrape['text']}
payload: {dump_payload(scrape['payload'])}