kopia lustrzana https://github.com/bellingcat/auto-archiver
fix in upstream lib for filenames
rodzic
74d421dc94
commit
8a8251d622
|
@ -875,7 +875,6 @@
|
|||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [],
|
||||
"hashes": [
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
|
||||
|
@ -1039,7 +1038,10 @@
|
|||
"version": "==4.1.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"extras": [],
|
||||
"extras": [
|
||||
"secure",
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
|
||||
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
|
||||
|
@ -1056,11 +1058,11 @@
|
|||
},
|
||||
"vk-url-scraper": {
|
||||
"hashes": [
|
||||
"sha256:1d98d593c6e5960b2b3334b3f34676a4315da480a3f76a417606e14558c392d7",
|
||||
"sha256:4c47b251e1cd1b58b385b2002d3a6afadc0397991615139814dd6fbfaa2f529b"
|
||||
"sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43",
|
||||
"sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.2.1"
|
||||
"version": "==0.2.4"
|
||||
},
|
||||
"websockets": {
|
||||
"hashes": [
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
import re, json
|
||||
import re, json, mimetypes
|
||||
|
||||
from loguru import logger
|
||||
from utils.misc import DateTimeEncoder
|
||||
from vk_url_scraper import VkScraper
|
||||
from vk_url_scraper import VkScraper, DateTimeEncoder
|
||||
|
||||
from storages import Storage
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
@ -38,19 +37,33 @@ class VkArchiver(Archiver):
|
|||
if len(results) == 0:
|
||||
return False
|
||||
|
||||
|
||||
dump_payload = lambda p : json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
textual_output = ""
|
||||
title, time = results[0]["text"], results[0]["datetime"]
|
||||
title, datetime = results[0]["text"], results[0]["datetime"]
|
||||
urls_found = []
|
||||
for res in results:
|
||||
textual_output+= f"id: {res['id']}<br>time utc: {res['datetime']}<br>text: {res['text']}<br>payload: {dump_payload(res['payload'])}<br><hr/><br>"
|
||||
textual_output += f"id: {res['id']}<br>time utc: {res['datetime']}<br>text: {res['text']}<br>payload: {dump_payload(res['payload'])}<br><hr/><br>"
|
||||
title = res["text"] if len(title) == 0 else title
|
||||
time = res["datetime"] if not time else time
|
||||
datetime = res["datetime"] if not datetime else datetime
|
||||
for attachments in res["attachments"].values():
|
||||
urls_found.extend(attachments)
|
||||
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(urls_found, url, textual_output)
|
||||
# if multiple wall/photos/videos are present the screenshot will only grab the 1st
|
||||
# we don't call generate_media_page which downloads urls because it cannot download vk video urls
|
||||
thumbnail = None
|
||||
uploaded_media = []
|
||||
filenames = self.vks.download_media(results, Storage.TMP_FOLDER)
|
||||
for filename in filenames:
|
||||
key = self.get_key(filename)
|
||||
self.storage.upload(filename, key)
|
||||
hash = self.get_hash(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
try:
|
||||
if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None:
|
||||
thumbnail = cdn_url
|
||||
except: pass
|
||||
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
|
||||
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
|
||||
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
|
||||
screenshot = self.get_screenshot(url)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title)
|
||||
|
|
|
@ -29,10 +29,3 @@ def getattr_or(o: object, prop: str, default=None):
|
|||
except:
|
||||
return default
|
||||
|
||||
|
||||
class DateTimeEncoder(json.JSONEncoder):
|
||||
# to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder)
|
||||
def default(self, o):
|
||||
if isinstance(o, datetime):
|
||||
return str(o) # with timezone
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
|
Ładowanie…
Reference in New Issue