pull/33/head
msramalho 2022-05-09 17:45:54 +02:00
rodzic 0d65798308
commit 6bd6f88b46
6 zmienionych plików z 16 dodań i 16 usunięć

Wyświetl plik

@ -30,7 +30,6 @@ class ArchiveResult:
class Archiver(ABC): class Archiver(ABC):
name = "default" name = "default"
TMP_FOLDER = "tmp/"
def __init__(self, storage: Storage, driver): def __init__(self, storage: Storage, driver):
self.storage = storage self.storage = storage
@ -61,7 +60,7 @@ class Archiver(ABC):
page += f"</body></html>" page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = Archiver.TMP_FOLDER + page_key page_filename = Storage.TMP_FOLDER + page_key
page_cdn = self.storage.get_cdn_url(page_key) page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f: with open(page_filename, "w") as f:
@ -86,7 +85,7 @@ class Archiver(ABC):
if '.' not in path: if '.' not in path:
key += '.jpg' key += '.jpg'
filename = Archiver.TMP_FOLDER + key filename = Storage.TMP_FOLDER + key
d = requests.get(media_url, headers=headers) d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f: with open(filename, 'wb') as f:
@ -127,7 +126,7 @@ class Archiver(ABC):
def get_screenshot(self, url): def get_screenshot(self, url):
key = self.get_key(urlparse(url).path.replace( key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = Archiver.TMP_FOLDER + key filename = Storage.TMP_FOLDER + key
try: try:
self.driver.get(url) self.driver.get(url)

Wyświetl plik

@ -6,6 +6,7 @@ import re
import html import html
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TelegramArchiver(Archiver): class TelegramArchiver(Archiver):
@ -52,7 +53,7 @@ class TelegramArchiver(Archiver):
video_id = video_url.split('/')[-1].split('?')[0] video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id) key = self.get_key(video_id)
filename = Archiver.TMP_FOLDER + key filename = Storage.TMP_FOLDER + key
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
if check_if_exists and self.storage.exists(key): if check_if_exists and self.storage.exists(key):

Wyświetl plik

@ -70,8 +70,8 @@ class TelethonArchiver(Archiver):
message = post.message message = post.message
for mp in media_posts: for mp in media_posts:
if len(mp.message) > len(message): message = mp.message if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'{Archiver.TMP_FOLDER}{chat}_{group_id}/{mp.id}') filename = self.client.download_media(mp.media, f'{Storage.TMP_FOLDER}{chat}_{group_id}/{mp.id}')
key = filename.split(Archiver.TMP_FOLDER)[1] key = filename.split(Storage.TMP_FOLDER)[1]
self.storage.upload(filename, key) self.storage.upload(filename, key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
@ -83,8 +83,8 @@ class TelethonArchiver(Archiver):
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1: elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}') key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'{Archiver.TMP_FOLDER}{key}') filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}')
key = filename.split(Archiver.TMP_FOLDER)[1].replace(" ", "") key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "")
self.storage.upload(filename, key) self.storage.upload(filename, key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)

Wyświetl plik

@ -3,7 +3,7 @@ import tiktok_downloader
from loguru import logger from loguru import logger
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TiktokArchiver(Archiver): class TiktokArchiver(Archiver):
name = "tiktok" name = "tiktok"
@ -18,7 +18,7 @@ class TiktokArchiver(Archiver):
info = tiktok_downloader.info_post(url) info = tiktok_downloader.info_post(url)
key = self.get_key(f'{info.id}.mp4') key = self.get_key(f'{info.id}.mp4')
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
filename = Archiver.TMP_FOLDER + key filename = Storage.TMP_FOLDER + key
if check_if_exists and self.storage.exists(key): if check_if_exists and self.storage.exists(key):
status = 'already archived' status = 'already archived'

Wyświetl plik

@ -5,11 +5,11 @@ import yt_dlp
from loguru import logger from loguru import logger
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class YoutubeDLArchiver(Archiver): class YoutubeDLArchiver(Archiver):
name = "youtube_dl" name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Archiver.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url) netloc = self.get_netloc(url)

Wyświetl plik

@ -9,7 +9,7 @@ from utils.gworksheet import GWorksheet
from storages import S3Config, S3Storage from storages import S3Config, S3Storage
from .wayback_config import WaybackConfig from .wayback_config import WaybackConfig
from .telegram_config import TelegramConfig from .telegram_config import TelegramConfig
from archivers import Archiver from storages import Storage
class Config: class Config:
@ -45,8 +45,8 @@ class Config:
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
self.header = int(getattr(self.args, "header") or execution.get("header", 1)) self.header = int(getattr(self.args, "header") or execution.get("header", 1))
self.tmp_folder = execution.get("tmp_folder", Archiver.TMP_FOLDER) self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER)
Archiver.TMP_FOLDER = self.tmp_folder Storage.TMP_FOLDER = self.tmp_folder
self.storage = execution.get("storage", "s3") self.storage = execution.get("storage", "s3")