Mirror of https://github.com/bellingcat/auto-archiver
adding thumbnails
parent 8a8251d622
commit c4efa6e597
@@ -1038,10 +1038,7 @@
             "version": "==4.1.1"
         },
         "urllib3": {
-            "extras": [
-                "secure",
-                "socks"
-            ],
+            "extras": [],
             "hashes": [
                 "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
                 "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@@ -1058,11 +1055,11 @@
         },
         "vk-url-scraper": {
             "hashes": [
-                "sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43",
-                "sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078"
+                "sha256:181c8a4b69e395a68bdf00e3dc1717e5218960c9fda6e90eea9633ff26fc9257",
+                "sha256:9cfc6bc3d7259f18508c3822955efac21ff9bad5bd886010b10f098ea10ad551"
             ],
             "index": "pypi",
-            "version": "==0.2.4"
+            "version": "==0.3.2"
         },
         "websockets": {
             "hashes": [
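Context for the vk-url-scraper bump above: the archiver drives that library through the VkScraper object imported in the code changes below (held as self.vks) and calls scrape() and download_media(), both visible in this diff. A minimal sketch of that flow, assuming VkScraper is constructed with VK account credentials (the constructor arguments and the example URL/folder are assumptions, not taken from this commit):

# Sketch only: shows the scrape -> download_media flow used by VkArchiver below.
# The VkScraper constructor arguments are assumed; check the vk-url-scraper docs.
from vk_url_scraper import VkScraper

vks = VkScraper("vk_username", "vk_password")  # assumed credentials-based constructor

# one URL may expand into several wall/photo/video items
results = vks.scrape("https://vk.com/wall-1_1")

# download any media referenced by the scraped items into a local folder
filenames = vks.download_media(results, "./tmp")
print(filenames)
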
@@ -197,8 +197,9 @@ class Archiver(ABC):
         return self.storage.get_cdn_url(key)
 
     def get_thumbnails(self, filename, key, duration=None):
-        thumbnails_folder = filename.split('.')[0] + '/'
-        key_folder = key.split('.')[0] + '/'
+        thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
+        key_folder = key.split('.')[0] + os.path.sep
+        logger.info(f"{filename=} {thumbnails_folder=} {key_folder=} ")
 
         mkdir_if_not_exists(thumbnails_folder)
 
@@ -222,7 +223,7 @@ class Archiver(ABC):
         for fname in thumbnails:
             if fname[-3:] == 'jpg':
                 thumbnail_filename = thumbnails_folder + fname
-                key = key_folder + fname
+                key = os.path.join(key_folder, fname)
 
                 self.storage.upload(thumbnail_filename, key)
                 cdn_url = self.storage.get_cdn_url(key)
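The two hunks above swap naive string splitting for os.path helpers when building the thumbnail folder and storage key. A minimal illustrative sketch (not part of the commit) of why that matters: splitting on the first '.' breaks as soon as the path itself contains a dot, such as a leading './', while os.path.splitext only strips the extension.

import os

filename = "./tmp/video.mp4"

# old approach: everything before the FIRST dot -> empty string for this path
print(filename.split('.')[0] + '/')                 # -> '/'
# new approach: strip only the extension, keep the rest of the path
print(os.path.splitext(filename)[0] + os.path.sep)  # -> './tmp/video/' on POSIX
# os.path.join also avoids doubled or missing separators when building keys
print(os.path.join("folder/", "thumb1.jpg"))        # -> 'folder/thumb1.jpg'
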
@@ -1,4 +1,4 @@
-import re, json, mimetypes
+import re, json, mimetypes, os
 
 from loguru import logger
 from vk_url_scraper import VkScraper, DateTimeEncoder
@@ -28,10 +28,10 @@ class VkArchiver(Archiver):
             return False
 
         key = self.get_html_key(url)
-        if check_if_exists and self.storage.exists(key):
-            screenshot = self.get_screenshot(url)
-            cdn_url = self.storage.get_cdn_url(key)
-            return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
+        # if check_if_exists and self.storage.exists(key):
+        #     screenshot = self.get_screenshot(url)
+        #     cdn_url = self.storage.get_cdn_url(key)
+        #     return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
 
         results = self.vks.scrape(url)  # some urls can contain multiple wall/photo/... parts and all will be fetched
         if len(results) == 0:
@@ -49,7 +49,7 @@ class VkArchiver(Archiver):
             urls_found.extend(attachments)
 
         # we don't call generate_media_page which downloads urls because it cannot download vk video urls
-        thumbnail = None
+        thumbnail, thumbnail_index = None, None
         uploaded_media = []
         filenames = self.vks.download_media(results, Storage.TMP_FOLDER)
         for filename in filenames:
@@ -58,12 +58,16 @@ class VkArchiver(Archiver):
             hash = self.get_hash(filename)
             cdn_url = self.storage.get_cdn_url(key)
             try:
-                if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None:
+                _type = mimetypes.guess_type(filename)[0].split("/")[0]
+                if _type == "image" and thumbnail is None:
                     thumbnail = cdn_url
-            except: pass
+                if _type == "video" and (thumbnail is None or thumbnail_index is None):
+                    thumbnail, thumbnail_index = self.get_thumbnails(filename, key)
+            except Exception as e:
+                logger.warning(f"failed to get thumb for {filename=} with {e=}")
             uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
 
         page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
         # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
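The reworked try/except above picks a thumbnail per downloaded file based on its MIME type: the first image becomes the thumbnail directly, while a video is handed to get_thumbnails for frame extraction. A standalone sketch of that selection logic (pick_thumbnail and the placeholder CDN URLs are illustrative names, not from the codebase):

import mimetypes

def pick_thumbnail(filenames):
    # Mirrors the branching in the diff: first image wins outright,
    # a video would trigger frame extraction (stubbed out here).
    thumbnail, thumbnail_index = None, None
    for filename in filenames:
        guessed = mimetypes.guess_type(filename)[0]
        if guessed is None:
            continue  # unknown type, e.g. no extension
        _type = guessed.split("/")[0]
        if _type == "image" and thumbnail is None:
            thumbnail = f"https://cdn.example.com/{filename}"  # placeholder URL
        if _type == "video" and (thumbnail is None or thumbnail_index is None):
            # in the real archiver this calls self.get_thumbnails(filename, key)
            thumbnail, thumbnail_index = f"https://cdn.example.com/{filename}.jpg", 0
    return thumbnail, thumbnail_index

print(pick_thumbnail(["a.txt", "b.jpg", "c.mp4"]))
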