adding thumbnails

pull/44/head
msramalho 2022-06-21 15:39:13 +02:00
rodzic 8a8251d622
commit c4efa6e597
3 zmienionych plików z 21 dodań i 19 usunięć

11
Pipfile.lock wygenerowano
Wyświetl plik

@ -1038,10 +1038,7 @@
"version": "==4.1.1"
},
"urllib3": {
"extras": [
"secure",
"socks"
],
"extras": [],
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@ -1058,11 +1055,11 @@
},
"vk-url-scraper": {
"hashes": [
"sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43",
"sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078"
"sha256:181c8a4b69e395a68bdf00e3dc1717e5218960c9fda6e90eea9633ff26fc9257",
"sha256:9cfc6bc3d7259f18508c3822955efac21ff9bad5bd886010b10f098ea10ad551"
],
"index": "pypi",
"version": "==0.2.4"
"version": "==0.3.2"
},
"websockets": {
"hashes": [

Wyświetl plik

@ -197,8 +197,9 @@ class Archiver(ABC):
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = filename.split('.')[0] + '/'
key_folder = key.split('.')[0] + '/'
thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
key_folder = key.split('.')[0] + os.path.sep
logger.info(f"{filename=} {thumbnails_folder=} {key_folder=} ")
mkdir_if_not_exists(thumbnails_folder)
@ -222,7 +223,7 @@ class Archiver(ABC):
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
key = os.path.join(key_folder, fname)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)

Wyświetl plik

@ -1,4 +1,4 @@
import re, json, mimetypes
import re, json, mimetypes, os
from loguru import logger
from vk_url_scraper import VkScraper, DateTimeEncoder
@ -28,10 +28,10 @@ class VkArchiver(Archiver):
return False
key = self.get_html_key(url)
if check_if_exists and self.storage.exists(key):
screenshot = self.get_screenshot(url)
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
# if check_if_exists and self.storage.exists(key):
# screenshot = self.get_screenshot(url)
# cdn_url = self.storage.get_cdn_url(key)
# return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
if len(results) == 0:
@ -49,7 +49,7 @@ class VkArchiver(Archiver):
urls_found.extend(attachments)
# we don't call generate_media_page which downloads urls because it cannot download vk video urls
thumbnail = None
thumbnail, thumbnail_index = None, None
uploaded_media = []
filenames = self.vks.download_media(results, Storage.TMP_FOLDER)
for filename in filenames:
@ -58,12 +58,16 @@ class VkArchiver(Archiver):
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
try:
if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None:
_type = mimetypes.guess_type(filename)[0].split("/")[0]
if _type == "image" and thumbnail is None:
thumbnail = cdn_url
except: pass
if _type == "video" and (thumbnail is None or thumbnail_index is None):
thumbnail, thumbnail_index = self.get_thumbnails(filename, key)
except Exception as e:
logger.warning(f"failed to get thumb for {filename=} with {e=}")
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)