From 8a8251d622886c38d4db0d6a6783a6876806d198 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 21 Jun 2022 01:44:48 +0200 Subject: [PATCH] fix in upstream lib for filenames --- Pipfile.lock | 12 +++++++----- archivers/vk_archiver.py | 35 ++++++++++++++++++++++++----------- utils/misc.py | 7 ------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 7a0b68c..9cd48b3 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -875,7 +875,6 @@ "version": "==2022.3.2" }, "requests": { - "extras": [], "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" @@ -1039,7 +1038,10 @@ "version": "==4.1.1" }, "urllib3": { - "extras": [], + "extras": [ + "secure", + "socks" + ], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -1056,11 +1058,11 @@ }, "vk-url-scraper": { "hashes": [ - "sha256:1d98d593c6e5960b2b3334b3f34676a4315da480a3f76a417606e14558c392d7", - "sha256:4c47b251e1cd1b58b385b2002d3a6afadc0397991615139814dd6fbfaa2f529b" + "sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43", + "sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078" ], "index": "pypi", - "version": "==0.2.1" + "version": "==0.2.4" }, "websockets": { "hashes": [ diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 0f2c0ac..8d4b195 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -1,8 +1,7 @@ -import re, json +import re, json, mimetypes from loguru import logger -from utils.misc import DateTimeEncoder -from vk_url_scraper import VkScraper +from vk_url_scraper import VkScraper, DateTimeEncoder from storages import Storage from .base_archiver import Archiver, ArchiveResult @@ -38,19 +37,33 @@ class VkArchiver(Archiver): if len(results) == 0: return False - - dump_payload = lambda p : json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) + def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) textual_output = "" - title, time = results[0]["text"], results[0]["datetime"] + title, datetime = results[0]["text"], results[0]["datetime"] urls_found = [] for res in results: - textual_output+= f"id: {res['id']}
time utc: {res['datetime']}
text: {res['text']}
payload: {dump_payload(res['payload'])}


" + textual_output += f"id: {res['id']}
time utc: {res['datetime']}
text: {res['text']}
payload: {dump_payload(res['payload'])}


" title = res["text"] if len(title) == 0 else title - time = res["datetime"] if not time else time + datetime = res["datetime"] if not datetime else datetime for attachments in res["attachments"].values(): urls_found.extend(attachments) - page_cdn, page_hash, thumbnail = self.generate_media_page(urls_found, url, textual_output) - # if multiple wall/photos/videos are present the screenshot will only grab the 1st + # we don't call generate_media_page which downloads urls because it cannot download vk video urls + thumbnail = None + uploaded_media = [] + filenames = self.vks.download_media(results, Storage.TMP_FOLDER) + for filename in filenames: + key = self.get_key(filename) + self.storage.upload(filename, key) + hash = self.get_hash(filename) + cdn_url = self.storage.get_cdn_url(key) + try: + if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None: + thumbnail = cdn_url + except: pass + uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + + page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail) + # # if multiple wall/photos/videos are present the screenshot will only grab the 1st screenshot = self.get_screenshot(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title) diff --git a/utils/misc.py b/utils/misc.py index 55a72f5..644c713 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -29,10 +29,3 @@ def getattr_or(o: object, prop: str, default=None): except: return default - -class DateTimeEncoder(json.JSONEncoder): - # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder) - def default(self, o): - if isinstance(o, datetime): - return str(o) # with timezone - return json.JSONEncoder.default(self, o)