kopia lustrzana https://github.com/bellingcat/auto-archiver
refactor
rodzic
0d65798308
commit
6bd6f88b46
|
@@ -30,7 +30,6 @@ class ArchiveResult:
|
|||
|
||||
class Archiver(ABC):
|
||||
name = "default"
|
||||
TMP_FOLDER = "tmp/"
|
||||
|
||||
def __init__(self, storage: Storage, driver):
|
||||
self.storage = storage
|
||||
|
@@ -61,7 +60,7 @@ class Archiver(ABC):
|
|||
page += f"</body></html>"
|
||||
|
||||
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
|
||||
page_filename = Archiver.TMP_FOLDER + page_key
|
||||
page_filename = Storage.TMP_FOLDER + page_key
|
||||
page_cdn = self.storage.get_cdn_url(page_key)
|
||||
|
||||
with open(page_filename, "w") as f:
|
||||
|
@@ -86,7 +85,7 @@ class Archiver(ABC):
|
|||
if '.' not in path:
|
||||
key += '.jpg'
|
||||
|
||||
filename = Archiver.TMP_FOLDER + key
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
|
||||
d = requests.get(media_url, headers=headers)
|
||||
with open(filename, 'wb') as f:
|
||||
|
@@ -127,7 +126,7 @@ class Archiver(ABC):
|
|||
def get_screenshot(self, url):
|
||||
key = self.get_key(urlparse(url).path.replace(
|
||||
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
||||
filename = Archiver.TMP_FOLDER + key
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
|
||||
try:
|
||||
self.driver.get(url)
|
||||
|
|
|
@@ -6,6 +6,7 @@ import re
|
|||
import html
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
from storages import Storage
|
||||
|
||||
|
||||
class TelegramArchiver(Archiver):
|
||||
|
@@ -52,7 +53,7 @@ class TelegramArchiver(Archiver):
|
|||
video_id = video_url.split('/')[-1].split('?')[0]
|
||||
key = self.get_key(video_id)
|
||||
|
||||
filename = Archiver.TMP_FOLDER + key
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
if check_if_exists and self.storage.exists(key):
|
||||
|
|
|
@@ -70,8 +70,8 @@ class TelethonArchiver(Archiver):
|
|||
message = post.message
|
||||
for mp in media_posts:
|
||||
if len(mp.message) > len(message): message = mp.message
|
||||
filename = self.client.download_media(mp.media, f'{Archiver.TMP_FOLDER}{chat}_{group_id}/{mp.id}')
|
||||
key = filename.split(Archiver.TMP_FOLDER)[1]
|
||||
filename = self.client.download_media(mp.media, f'{Storage.TMP_FOLDER}{chat}_{group_id}/{mp.id}')
|
||||
key = filename.split(Storage.TMP_FOLDER)[1]
|
||||
self.storage.upload(filename, key)
|
||||
hash = self.get_hash(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
@@ -83,8 +83,8 @@ class TelethonArchiver(Archiver):
|
|||
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
|
||||
elif len(media_posts) == 1:
|
||||
key = self.get_key(f'{chat}_{post_id}')
|
||||
filename = self.client.download_media(post.media, f'{Archiver.TMP_FOLDER}{key}')
|
||||
key = filename.split(Archiver.TMP_FOLDER)[1].replace(" ", "")
|
||||
filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}')
|
||||
key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "")
|
||||
self.storage.upload(filename, key)
|
||||
hash = self.get_hash(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
|
|
@@ -3,7 +3,7 @@ import tiktok_downloader
|
|||
from loguru import logger
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
from storages import Storage
|
||||
|
||||
class TiktokArchiver(Archiver):
|
||||
name = "tiktok"
|
||||
|
@@ -18,7 +18,7 @@ class TiktokArchiver(Archiver):
|
|||
info = tiktok_downloader.info_post(url)
|
||||
key = self.get_key(f'{info.id}.mp4')
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
filename = Archiver.TMP_FOLDER + key
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
|
||||
if check_if_exists and self.storage.exists(key):
|
||||
status = 'already archived'
|
||||
|
|
|
@@ -5,11 +5,11 @@ import yt_dlp
|
|||
from loguru import logger
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
from storages import Storage
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "youtube_dl"
|
||||
ydl_opts = {'outtmpl': f'{Archiver.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
||||
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
netloc = self.get_netloc(url)
|
||||
|
|
|
@@ -9,7 +9,7 @@ from utils.gworksheet import GWorksheet
|
|||
from storages import S3Config, S3Storage
|
||||
from .wayback_config import WaybackConfig
|
||||
from .telegram_config import TelegramConfig
|
||||
from archivers import Archiver
|
||||
from storages import Storage
|
||||
|
||||
|
||||
class Config:
|
||||
|
@@ -45,8 +45,8 @@ class Config:
|
|||
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
|
||||
|
||||
self.header = int(getattr(self.args, "header") or execution.get("header", 1))
|
||||
self.tmp_folder = execution.get("tmp_folder", Archiver.TMP_FOLDER)
|
||||
Archiver.TMP_FOLDER = self.tmp_folder
|
||||
self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER)
|
||||
Storage.TMP_FOLDER = self.tmp_folder
|
||||
|
||||
self.storage = execution.get("storage", "s3")
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue