Mirror of https://github.com/bellingcat/auto-archiver
WIP docker changes for cli and auto_archiver
parent 390b84eb22
commit 04263094ad
@@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# RUN curl -fsSL https://get.docker.com | sh

# RUN git clone https://github.com/bellingcat/auto-archiver
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
ENV IS_DOCKER=1
COPY ./src/ .

# CMD ["pipenv", "run", "python", "auto_archive.py"]
ENTRYPOINT ["python", "auto_archive.py"]
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
ENTRYPOINT ["python"]
# ENTRYPOINT ["docker-entrypoint.sh"]

# should be executed with 2 volumes
# should be executed with 2 volumes (3 if local_storage)
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
@@ -1,8 +1,9 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
from dataclasses import dataclass
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange
from collections import defaultdict

import ffmpeg
from loguru import logger

@@ -27,6 +28,7 @@ class ArchiveResult:
    screenshot: str = None
    wacz: str = None
    hash: str = None
    media: list = field(default_factory=list)

class Archiver(ABC):
    name = "default"
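The new media field uses field(default_factory=list) rather than a bare media: list = [], which the dataclasses module rejects because a shared mutable default would leak state between ArchiveResult instances. A minimal standalone sketch of that behaviour (illustrative only, not part of the commit):

from dataclasses import dataclass, field

@dataclass
class Result:
    # media: list = []  # would raise ValueError: mutable default is not allowed
    media: list = field(default_factory=list)  # each instance gets its own list

a, b = Result(), Result()
a.media.append("x")
assert b.media == []  # no state shared between instances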
@@ -38,6 +40,7 @@ class Archiver(ABC):
        self.hash_algorithm = config.hash_algorithm
        self.browsertrix = config.browsertrix_config
        self.is_docker = config.is_docker
        self.media = []

    def __str__(self):
        return self.__class__.__name__

@@ -48,13 +51,28 @@ class Archiver(ABC):
    @abstractmethod
    def download(self, url, check_if_exists=False): pass

    def generateArchiveResult(self, **kwargs):
        # remove duplicates
        if "cdn_url" in kwargs:
            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
        return ArchiveResult(**kwargs)

    def get_netloc(self, url):
        return urlparse(url).netloc

    def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
        media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
        if key: media_info["key"] = key
        if hash: media_info["hash"] = hash
        self.media.append(media_info)

    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
        """
        Generates an index.html page where each @urls_info is displayed
        """
        for ui in urls_info:
            self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
            <body>
            <h2>Archived media from {self.name}</h2>
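generateArchiveResult collects everything registered through add_to_media, deduplicates it by round-tripping each dict through a tuple of its items, and only then builds the ArchiveResult. A standalone sketch of that dedup idiom (illustrative values; it assumes hashable dict values and does not preserve insertion order):

media = [
    {"url": "https://example.com/a.jpg", "mime": "misc"},
    {"url": "https://example.com/a.jpg", "mime": "misc"},  # exact duplicate
    {"url": "https://example.com/b.jpg", "mime": "misc"},
]
deduped = [dict(t) for t in {tuple(d.items()) for d in media}]
assert len(deduped) == 2  # duplicates collapse; ordering is not guaranteed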
@@ -109,6 +127,8 @@ class Archiver(ABC):
        For a list of media urls, fetch them, upload them
        and call self.generate_media_page_html with them
        """
        for media_url in urls:
            self.add_to_media(media_url)

        thumbnail = None
        uploaded_media = []

@@ -201,17 +221,20 @@ class Archiver(ABC):
        self.driver.save_screenshot(filename)
        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

        return self.storage.get_cdn_url(key)
        cdn_url = self.storage.get_cdn_url(key)
        self.add_to_media(cdn_url, key)

        return cdn_url

    def get_wacz(self, url):
        if not self.browsertrix.enabled:
            logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
            return
            return
        if self.is_docker:
            # TODO: figure out support for browsertrix in docker
            # see: https://github.com/bellingcat/auto-archiver/issues/66
            logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
            return
            return

        logger.debug(f"getting wacz for {url}")
        key = self._get_key_from_url(url, ".wacz", append_datetime=True)

@@ -220,7 +243,7 @@ class Archiver(ABC):
        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
        cmd = [
            "docker", "run",
            "--rm", # delete container once it has completed running
            "--rm", # delete container once it has completed running
            "-v", f"{browsertrix_home}:/crawls/",
            # "-it", # this leads to "the input device is not a TTY"
            "webrecorder/browsertrix-crawler", "crawl",
@@ -253,18 +276,19 @@ class Archiver(ABC):
        # do not crash if upload fails
        try:
            self.storage.upload(filename, key, extra_args={
                'ACL': 'public-read', 'ContentType': 'application/zip'})
                'ACL': 'public-read', 'ContentType': 'application/zip'})
        except FileNotFoundError as e:
            logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")


        # clean up the local browsertrix files
        try:
            shutil.rmtree(browsertrix_home)
        except PermissionError:
            logger.warning(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")

        return self.storage.get_cdn_url(key)
        cdn_url = self.storage.get_cdn_url(key)
        self.add_to_media(cdn_url, key)
        return cdn_url

    def get_thumbnails(self, filename, key, duration=None):
        thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
@@ -52,7 +52,7 @@ class InstagramArchiver(Archiver):
            cdn_url = self.storage.get_cdn_url(key)
            screenshot = self.get_screenshot(url)
            wacz = self.get_wacz(url)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)

        try:
            # process if post

@@ -137,4 +137,4 @@ class InstagramArchiver(Archiver):
        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)

        return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
@@ -47,7 +47,7 @@ class TelegramArchiver(Archiver):
            time_elements = s.find_all('time')
            timestamp = time_elements[0].get('datetime') if len(time_elements) else None

            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
            return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)

        video_url = video.get('src')
        video_id = video_url.split('/')[-1].split('?')[0]

@@ -85,5 +85,5 @@ class TelegramArchiver(Archiver):
        os.remove(filename)

        cdn_url = self.storage.get_cdn_url(key)
        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
        if check_if_exists and self.storage.exists(key):
            # only s3 storage supports storage.exists as not implemented on gd
            cdn_url = self.storage.get_cdn_url(key)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)

        key_thumb, thumb_index = None, None
        group_id = post.grouped_id if post.grouped_id is not None else post.id

@@ -119,7 +119,7 @@ class TelethonArchiver(Archiver):

            page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))

            return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
            return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)

        page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
        return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
@@ -28,9 +28,9 @@ class TiktokArchiver(Archiver):

            if len(media) <= 0:
                if status == 'already archived':
                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                    return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                else:
                    return ArchiveResult(status='Could not download media')
                    return self.generateArchiveResult(status='Could not download media')

            logger.info(f'downloading video {key=}')
            media[0].download(filename)

@@ -56,17 +56,17 @@ class TiktokArchiver(Archiver):
            cdn_url = self.storage.get_cdn_url(key)
            timestamp = info.create.isoformat() if hasattr(info, "create") else None

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
            return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

        except tiktok_downloader.Except.InvalidUrl as e:
            status = 'Invalid URL'
            logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
            return ArchiveResult(status=status)
            return self.generateArchiveResult(status=status)

        except:
            error = traceback.format_exc()
            status = 'Other Tiktok error: ' + str(error)
            logger.warning(f'Other Tiktok error' + str(error))
            return ArchiveResult(status=status)
            return self.generateArchiveResult(status=status)
@@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver):
            # only s3 storage supports storage.exists as not implemented on gd
            cdn_url = self.storage.get_cdn_url(key)
            screenshot = self.get_screenshot(url)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)

        urls = []
        if tweet.includes:

@@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver):
        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)
        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
            screenshot = self.get_screenshot(url)
            wacz = self.get_wacz(url)
            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
            return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)

        urls = []

@@ -62,7 +62,7 @@ class TwitterArchiver(Archiver):
        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)

        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)

    def download_alternative(self, url, tweet_id):
        # https://stackoverflow.com/a/71867055/6196010

@@ -87,7 +87,7 @@ class TwitterArchiver(Archiver):
        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)
        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)

    def choose_variant(self, variants):
        # choosing the highest quality possible
@@ -31,7 +31,7 @@ class VkArchiver(Archiver):
        # if check_if_exists and self.storage.exists(key):
        #     screenshot = self.get_screenshot(url)
        #     cdn_url = self.storage.get_cdn_url(key)
        #     return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
        #     return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)

        results = self.vks.scrape(url)  # some urls can contain multiple wall/photo/... parts and all will be fetched
        if len(results) == 0:

@@ -71,4 +71,4 @@ class VkArchiver(Archiver):
        # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
        screenshot = self.get_screenshot(url)
        wacz = self.get_wacz(url)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
@@ -39,7 +39,7 @@ class WaybackArchiver(Archiver):

        if r.status_code != 200:
            logger.warning(f"Internet archive failed with status of {r.status_code}")
            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
            return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)

        if 'job_id' not in r.json() and 'message' in r.json():
            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
            return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)

        status_json = status_r.json()
        if status_json['status'] != 'success':

@@ -77,7 +77,7 @@ class WaybackArchiver(Archiver):
                title = 'Could not get title'
        except:
            title = "Could not get title"
        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
        self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
        return self.seen_urls[url]

    def custom_retry(self, json_data, **kwargs):

@@ -86,4 +86,4 @@ class WaybackArchiver(Archiver):
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in str(json_data).lower():
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)  # 24h to 36h later
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
        return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
@@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver):

        if info.get('is_live', False):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")
            return self.generateArchiveResult(status="Streaming media")

        if 'twitter.com' in netloc:
            if 'https://twitter.com/' in info['webpage_url']:

@@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver):
        elif 'upload_date' in info and info['upload_date'] is not None:
            timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
@@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet):
    return missing


def should_process_sheet(c, sheet_name):
def should_process_sheet(c: Config, sheet_name):
    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
        # ALLOW rules exist AND sheet name not explicitly allowed
        return False

@@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name):
    return True


def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
    url = expand_url(url)
    c.set_folder(folder)
    storage = c.get_storage()

    # make a new driver so each spreadsheet row is idempotent
    c.recreate_webdriver()

    # order matters, first to succeed excludes remaining
    active_archivers = [
        TelethonArchiver(storage, c),
        TiktokArchiver(storage, c),
        TwitterApiArchiver(storage, c),
        InstagramArchiver(storage, c),
        YoutubeDLArchiver(storage, c),
        TelegramArchiver(storage, c),
        TwitterArchiver(storage, c),
        VkArchiver(storage, c),
        WaybackArchiver(storage, c)
    ]

    for archiver in active_archivers:
        logger.debug(f'Trying {archiver} on {debug_string}')

        try:
            result = archiver.download(url, check_if_exists=c.check_if_exists)
        except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
        except Exception as e:
            result = False
            logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

        if result:
            success = result.status in ['success', 'already archived']
            result.status = f"{archiver.name}: {result.status}"
            if success:
                logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
                break
            # only 1 retry possible for now
            if is_retry and Archiver.is_retry(result.status):
                result.status = Archiver.remove_retry(result.status)
            logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
    return result


def process_sheet(c: Config):
    sh = c.gsheets_client.open(c.sheet)
@@ -100,46 +144,7 @@ def process_sheet(c: Config):
            # All checks done - archival process starts here
            try:
                gw.set_cell(row, 'status', 'Archive in progress')
                url = expand_url(url)
                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))

                # make a new driver so each spreadsheet row is idempotent
                c.recreate_webdriver()

                # order matters, first to succeed excludes remaining
                active_archivers = [
                    TelethonArchiver(storage, c),
                    TiktokArchiver(storage, c),
                    TwitterApiArchiver(storage, c),
                    InstagramArchiver(storage, c),
                    YoutubeDLArchiver(storage, c),
                    TelegramArchiver(storage, c),
                    TwitterArchiver(storage, c),
                    VkArchiver(storage, c),
                    WaybackArchiver(storage, c)
                ]

                for archiver in active_archivers:
                    logger.debug(f'Trying {archiver} on {row=}')

                    try:
                        result = archiver.download(url, check_if_exists=c.check_if_exists)
                    except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
                    except Exception as e:
                        result = False
                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

                    if result:
                        success = result.status in ['success', 'already archived']
                        result.status = f"{archiver.name}: {result.status}"
                        if success:
                            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                            break
                        # only 1 retry possible for now
                        if is_retry and Archiver.is_retry(result.status):
                            result.status = Archiver.remove_retry(result.status)
                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

                result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
                if result:
                    update_sheet(gw, row, url, result)
                else:
src/cli.py (new file, +30 lines)

@@ -0,0 +1,30 @@
import tempfile, json
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
from slugify import slugify


def main():
    c = Config()
    c.parse()
    url = c.url
    if not url:
        logger.error(f"Invalid URL: '{url}'")
        return
    logger.info(f'Archiving "{url=}".')
    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
        Storage.TMP_FOLDER = tmpdir
        result = auto_archive.archive_url(c, url, "", f"{url=}", False)
        c.destroy_webdriver()
        key = f"media_{slugify(url)}.json"
        with open(key, "w", encoding="utf-8") as outf:
            json.dump(result.media, outf, ensure_ascii=False, indent=4)
        c.get_storage().upload(key, key)
        print(result)
        return result


if __name__ == "__main__":
    main()
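A hedged usage sketch, not part of the commit: with a valid config.yaml, the new single-URL mode would be invoked roughly as python src/cli.py --url "https://example.com/some-post" --config config.yaml (the --url flag is added to Config in the next hunk). Per main() above, it dumps result.media to media_<slugified-url>.json and uploads that file via the configured storage; each entry follows the dict shape built by add_to_media, for example:

# illustrative only: URLs, keys and hashes are placeholders, and the "mime" value
# depends on Archiver._guess_file_type, which is not shown in this diff
[
    {"url": "https://cdn.example.com/abc/video.mp4", "mime": "misc", "key": "abc/video.mp4", "hash": "<hash>"},
    {"url": "https://cdn.example.com/abc/index.html", "mime": "misc"}
]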
@@ -47,6 +47,8 @@ class Config:
        with open(self.config_file, "r", encoding="utf-8") as inf:
            self.config = yaml.safe_load(inf)

        self.url = getattr_or(self.args, "url", '')

        # ----------------------EXECUTION - execution configurations
        execution = self.config.get("execution", {})

@@ -211,6 +213,7 @@ class Config:
        """
        parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. Command line arguments always override the configurations in the provided YAML config file (--config); only some high-level options are available via the command line, and the YAML configuration file is the preferred method. The sheet must have "url" and "status" columns for the archiver to work.')

        parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
        parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
        parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
@@ -1,3 +1,4 @@
import os, uuid
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path

@@ -18,6 +19,14 @@ class Storage(ABC):
    @abstractmethod
    def uploadf(self, file, key, **kwargs): pass

    def clean_key(self, key):
        # Some storages do not work well with leading forward slashes and some keys come with that
        if key.startswith('/'):
            logger.debug(f'Found and fixed a leading "/" for {key=}')
            return key[1:]
        return key


    def upload(self, filename: str, key: str, **kwargs):
        logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
        with open(filename, 'rb') as f:
@@ -116,13 +116,6 @@ class GDStorage(Storage):
        # GD only requires the filename not a file reader
        self.uploadf(filename, key, **kwargs)

    def clean_key(self, key):
        # GDrive does not work well with trailing forward slashes and some keys come with that
        if key.startswith('/'):
            logger.debug(f'Found and fixed a leading "/" for {key=}')
            return key[1:]
        return key

    # gets the Drive folderID if it is there
    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
        """
@@ -1,6 +1,7 @@
import os

from dataclasses import dataclass
from loguru import logger

from .base_storage import Storage
from utils import mkdir_if_not_exists

@@ -18,8 +19,12 @@ class LocalStorage(Storage):
        mkdir_if_not_exists(self.save_to)

    def get_cdn_url(self, key):
        key = self.clean_key(key)
        logger.info(f"{key=}")
        full_path = os.path.join(self.save_to, self.folder, key)
        mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
        logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
        return os.path.abspath(full_path)

    def exists(self, key):