fix in upstream lib for filenames

pull/44/head
msramalho 2022-06-21 01:44:48 +02:00
parent 74d421dc94
commit 8a8251d622
3 changed files with 31 additions and 23 deletions

Pipfile.lock (generated)
View file

@@ -875,7 +875,6 @@
             "version": "==2022.3.2"
         },
         "requests": {
-            "extras": [],
             "hashes": [
                 "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
                 "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
@@ -1039,7 +1038,10 @@
             "version": "==4.1.1"
         },
         "urllib3": {
-            "extras": [],
+            "extras": [
+                "secure",
+                "socks"
+            ],
             "hashes": [
                 "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
                 "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@@ -1056,11 +1058,11 @@
         },
         "vk-url-scraper": {
             "hashes": [
-                "sha256:1d98d593c6e5960b2b3334b3f34676a4315da480a3f76a417606e14558c392d7",
-                "sha256:4c47b251e1cd1b58b385b2002d3a6afadc0397991615139814dd6fbfaa2f529b"
+                "sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43",
+                "sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078"
             ],
             "index": "pypi",
-            "version": "==0.2.1"
+            "version": "==0.2.4"
         },
         "websockets": {
             "hashes": [

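The dependency bump is the heart of this change: vk-url-scraper moves from 0.2.1 to 0.2.4, the release that (per the archiver changes below) ships DateTimeEncoder and the filename fix named in the commit message. A minimal sanity check that the pinned version is what actually got installed, assuming the environment was re-created from this lockfile (importlib.metadata is standard library on Python 3.8+):

# hypothetical post-install check; not part of the commit
from importlib.metadata import version

assert version("vk-url-scraper") == "0.2.4"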
View file

@@ -1,8 +1,7 @@
-import re, json
+import re, json, mimetypes
 from loguru import logger
-from utils.misc import DateTimeEncoder
-from vk_url_scraper import VkScraper
+from vk_url_scraper import VkScraper, DateTimeEncoder
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
@@ -38,19 +37,33 @@ class VkArchiver(Archiver):
         if len(results) == 0:
             return False
-        dump_payload = lambda p : json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
+        def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
         textual_output = ""
-        title, time = results[0]["text"], results[0]["datetime"]
+        title, datetime = results[0]["text"], results[0]["datetime"]
         urls_found = []
         for res in results:
-            textual_output+= f"id: {res['id']}<br>time utc: {res['datetime']}<br>text: {res['text']}<br>payload: {dump_payload(res['payload'])}<br><hr/><br>"
+            textual_output += f"id: {res['id']}<br>time utc: {res['datetime']}<br>text: {res['text']}<br>payload: {dump_payload(res['payload'])}<br><hr/><br>"
             title = res["text"] if len(title) == 0 else title
-            time = res["datetime"] if not time else time
+            datetime = res["datetime"] if not datetime else datetime
             for attachments in res["attachments"].values():
                 urls_found.extend(attachments)
-        page_cdn, page_hash, thumbnail = self.generate_media_page(urls_found, url, textual_output)
-        # if multiple wall/photos/videos are present the screenshot will only grab the 1st
+        # we don't call generate_media_page which downloads urls because it cannot download vk video urls
+        thumbnail = None
+        uploaded_media = []
+        filenames = self.vks.download_media(results, Storage.TMP_FOLDER)
+        for filename in filenames:
+            key = self.get_key(filename)
+            self.storage.upload(filename, key)
+            hash = self.get_hash(filename)
+            cdn_url = self.storage.get_cdn_url(key)
+            try:
+                if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None:
+                    thumbnail = cdn_url
+            except: pass
+            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
+        page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
+        # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title)

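Taken together, the new logic delegates the media download to the upstream library and only does the upload and thumbnail bookkeeping in the archiver. A standalone sketch of that flow, outside the Archiver class, assuming vk_url_scraper's VkScraper exposes scrape() and download_media() as used in the diff; the credentials, post URL and ./tmp folder are placeholders:

# standalone sketch; mirrors the thumbnail-selection logic in the diff above
import mimetypes
from vk_url_scraper import VkScraper

vks = VkScraper("VK_USERNAME", "VK_PASSWORD")      # placeholder credentials
results = vks.scrape("https://vk.com/wall-1_1")    # placeholder post URL
filenames = vks.download_media(results, "./tmp")   # local paths of the downloaded media

thumbnail = None
for filename in filenames:
    mime, _ = mimetypes.guess_type(filename)
    # guess_type can return (None, None), hence the guard (the archiver wraps this in try/except)
    if thumbnail is None and mime and mime.startswith("image/"):
        thumbnail = filename
print(thumbnail)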
View file

@@ -29,10 +29,3 @@ def getattr_or(o: object, prop: str, default=None):
     except:
         return default
-class DateTimeEncoder(json.JSONEncoder):
-    # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
-    def default(self, o):
-        if isinstance(o, datetime):
-            return str(o)  # with timezone
-        return json.JSONEncoder.default(self, o)
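The local encoder can go because the same helper is now imported from vk_url_scraper (see the import change above). A minimal sketch of the behaviour the archiver relies on, assuming the library's encoder matches the class removed here, i.e. datetimes are serialized via str():

# illustrative only; mirrors the removed utils.misc.DateTimeEncoder
import json
from datetime import datetime, timezone
from vk_url_scraper import DateTimeEncoder

payload = {"id": 1, "datetime": datetime(2022, 6, 21, tzinfo=timezone.utc)}
print(json.dumps(payload, ensure_ascii=False, indent=4, cls=DateTimeEncoder))
# the datetime comes out as its str() form, e.g. "2022-06-21 00:00:00+00:00"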