pull/33/head
msramalho 2022-05-09 17:45:54 +02:00
rodzic 0d65798308
commit 6bd6f88b46
6 zmienionych plików z 16 dodań i 16 usunięć

Wyświetl plik

@@ -30,7 +30,6 @@ class ArchiveResult:
class Archiver(ABC):
name = "default"
TMP_FOLDER = "tmp/"
def __init__(self, storage: Storage, driver):
self.storage = storage
@@ -61,7 +60,7 @@ class Archiver(ABC):
page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = Archiver.TMP_FOLDER + page_key
page_filename = Storage.TMP_FOLDER + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
@@ -86,7 +85,7 @@ class Archiver(ABC):
if '.' not in path:
key += '.jpg'
filename = Archiver.TMP_FOLDER + key
filename = Storage.TMP_FOLDER + key
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
@@ -127,7 +126,7 @@ class Archiver(ABC):
def get_screenshot(self, url):
key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = Archiver.TMP_FOLDER + key
filename = Storage.TMP_FOLDER + key
try:
self.driver.get(url)

Wyświetl plik

@@ -6,6 +6,7 @@ import re
import html
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TelegramArchiver(Archiver):
@@ -52,7 +53,7 @@ class TelegramArchiver(Archiver):
video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id)
filename = Archiver.TMP_FOLDER + key
filename = Storage.TMP_FOLDER + key
cdn_url = self.storage.get_cdn_url(key)
if check_if_exists and self.storage.exists(key):

Wyświetl plik

@@ -70,8 +70,8 @@ class TelethonArchiver(Archiver):
message = post.message
for mp in media_posts:
if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'{Archiver.TMP_FOLDER}{chat}_{group_id}/{mp.id}')
key = filename.split(Archiver.TMP_FOLDER)[1]
filename = self.client.download_media(mp.media, f'{Storage.TMP_FOLDER}{chat}_{group_id}/{mp.id}')
key = filename.split(Storage.TMP_FOLDER)[1]
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
@@ -83,8 +83,8 @@ class TelethonArchiver(Archiver):
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'{Archiver.TMP_FOLDER}{key}')
key = filename.split(Archiver.TMP_FOLDER)[1].replace(" ", "")
filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}')
key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "")
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)

Wyświetl plik

@@ -3,7 +3,7 @@ import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TiktokArchiver(Archiver):
name = "tiktok"
@@ -18,7 +18,7 @@ class TiktokArchiver(Archiver):
info = tiktok_downloader.info_post(url)
key = self.get_key(f'{info.id}.mp4')
cdn_url = self.storage.get_cdn_url(key)
filename = Archiver.TMP_FOLDER + key
filename = Storage.TMP_FOLDER + key
if check_if_exists and self.storage.exists(key):
status = 'already archived'

Wyświetl plik

@@ -5,11 +5,11 @@ import yt_dlp
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Archiver.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)

Wyświetl plik

@@ -9,7 +9,7 @@ from utils.gworksheet import GWorksheet
from storages import S3Config, S3Storage
from .wayback_config import WaybackConfig
from .telegram_config import TelegramConfig
from archivers import Archiver
from storages import Storage
class Config:
@@ -45,8 +45,8 @@ class Config:
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
self.header = int(getattr(self.args, "header") or execution.get("header", 1))
self.tmp_folder = execution.get("tmp_folder", Archiver.TMP_FOLDER)
Archiver.TMP_FOLDER = self.tmp_folder
self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER)
Storage.TMP_FOLDER = self.tmp_folder
self.storage = execution.get("storage", "s3")