Mirror of https://github.com/bellingcat/auto-archiver
WIP docker changes for cli and auto_archiver
parent 390b84eb22
commit 04263094ad
@@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \
 # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
 # RUN curl -fsSL https://get.docker.com | sh
 
-# RUN git clone https://github.com/bellingcat/auto-archiver
 # TODO: avoid copying unnecessary files, including .git
 COPY Pipfile Pipfile.lock ./
 RUN pipenv install --python=3.10 --system --deploy
 ENV IS_DOCKER=1
 COPY ./src/ .
 
-# CMD ["pipenv", "run", "python", "auto_archive.py"]
-ENTRYPOINT ["python", "auto_archive.py"]
+# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
+# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
+# USER archiver
+ENTRYPOINT ["python"]
 # ENTRYPOINT ["docker-entrypoint.sh"]
 
-# should be executed with 2 volumes
+# should be executed with 2 volumes (3 if local_storage)
 # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
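Note on the entrypoint change above: with ENTRYPOINT ["python"], the script to run becomes the first container argument, so the same image can serve both the spreadsheet flow and the new single-URL flow. Assuming the image is tagged aa as in the example comment, an invocation would presumably look like "docker run ... aa cli.py --url=https://example.com/post" or "docker run ... aa auto_archive.py --sheet=...", where the URL and sheet name are placeholders.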
@@ -1,8 +1,9 @@
 import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
 from random import randrange
+from collections import defaultdict
 
 import ffmpeg
 from loguru import logger

@@ -27,6 +28,7 @@ class ArchiveResult:
     screenshot: str = None
     wacz: str = None
     hash: str = None
+    media: list = field(default_factory=list)
 
 class Archiver(ABC):
     name = "default"

@@ -38,6 +40,7 @@ class Archiver(ABC):
         self.hash_algorithm = config.hash_algorithm
         self.browsertrix = config.browsertrix_config
         self.is_docker = config.is_docker
+        self.media = []
 
     def __str__(self):
         return self.__class__.__name__
@@ -48,13 +51,28 @@ class Archiver(ABC):
     @abstractmethod
     def download(self, url, check_if_exists=False): pass
 
+    def generateArchiveResult(self, **kwargs):
+        # remove duplicates
+        if "cdn_url" in kwargs:
+            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
+        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
+        return ArchiveResult(**kwargs)
+
     def get_netloc(self, url):
         return urlparse(url).netloc
 
+    def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
+        media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
+        if key: media_info["key"] = key
+        if hash: media_info["hash"] = hash
+        self.media.append(media_info)
+
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
         """
+        for ui in urls_info:
+            self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
         page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
 <body>
 <h2>Archived media from {self.name}</h2>
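The de-duplication line in generateArchiveResult turns each media dict into a tuple of its items so a set can drop exact duplicates; a standalone sketch of the same expression, with invented values:

    media = [
        {"url": "https://cdn.example.com/a.jpg", "mime": "image/jpeg"},
        {"url": "https://cdn.example.com/a.jpg", "mime": "image/jpeg"},  # exact duplicate
        {"url": "https://cdn.example.com/b.mp4", "mime": "video/mp4"},
    ]
    deduped = [dict(t) for t in {tuple(d.items()) for d in media}]
    print(len(deduped))  # 2 -- the set round-trip drops duplicates but does not preserve order

Each entry follows the shape built by add_to_media above: always "url" and "mime", plus "key" and "hash" when known.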
@@ -109,6 +127,8 @@ class Archiver(ABC):
         For a list of media urls, fetch them, upload them
         and call self.generate_media_page_html with them
         """
+        for media_url in urls:
+            self.add_to_media(media_url)
 
         thumbnail = None
         uploaded_media = []

@@ -201,17 +221,20 @@ class Archiver(ABC):
         self.driver.save_screenshot(filename)
         self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
 
-        return self.storage.get_cdn_url(key)
+        cdn_url = self.storage.get_cdn_url(key)
+        self.add_to_media(cdn_url, key)
+
+        return cdn_url
 
     def get_wacz(self, url):
         if not self.browsertrix.enabled:
             logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
             return
         if self.is_docker:
             # TODO: figure out support for browsertrix in docker
             # see: https://github.com/bellingcat/auto-archiver/issues/66
             logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
             return
 
         logger.debug(f"getting wacz for {url}")
         key = self._get_key_from_url(url, ".wacz", append_datetime=True)

@@ -220,7 +243,7 @@ class Archiver(ABC):
         browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
         cmd = [
             "docker", "run",
             "--rm", # delete container once it has completed running
             "-v", f"{browsertrix_home}:/crawls/",
             # "-it", # this leads to "the input device is not a TTY"
             "webrecorder/browsertrix-crawler", "crawl",

@@ -253,18 +276,19 @@ class Archiver(ABC):
         # do not crash if upload fails
         try:
             self.storage.upload(filename, key, extra_args={
                 'ACL': 'public-read', 'ContentType': 'application/zip'})
         except FileNotFoundError as e:
             logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
 
-
         # clean up the local browsertrix files
         try:
             shutil.rmtree(browsertrix_home)
         except PermissionError:
             logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
 
-        return self.storage.get_cdn_url(key)
+        cdn_url = self.storage.get_cdn_url(key)
+        self.add_to_media(cdn_url, key)
+        return cdn_url
 
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
@@ -52,7 +52,7 @@ class InstagramArchiver(Archiver):
             cdn_url = self.storage.get_cdn_url(key)
             screenshot = self.get_screenshot(url)
             wacz = self.get_wacz(url)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
 
         try:
             # process if post

@@ -137,4 +137,4 @@ class InstagramArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
 
-        return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)

@@ -47,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
+            return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
 
         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]

@@ -85,5 +85,5 @@ class TelegramArchiver(Archiver):
         os.remove(filename)
 
         cdn_url = self.storage.get_cdn_url(key)
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                              duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)

@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
         if check_if_exists and self.storage.exists(key):
             # only s3 storage supports storage.exists as not implemented on gd
             cdn_url = self.storage.get_cdn_url(key)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
 
         key_thumb, thumb_index = None, None
         group_id = post.grouped_id if post.grouped_id is not None else post.id

@@ -119,7 +119,7 @@ class TelethonArchiver(Archiver):
 
             page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
 
-            return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
+            return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
 
         page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
-        return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
+        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)

@@ -28,9 +28,9 @@ class TiktokArchiver(Archiver):
 
             if len(media) <= 0:
                 if status == 'already archived':
-                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
+                    return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                 else:
-                    return ArchiveResult(status='Could not download media')
+                    return self.generateArchiveResult(status='Could not download media')
 
             logger.info(f'downloading video {key=}')
             media[0].download(filename)

@@ -56,17 +56,17 @@ class TiktokArchiver(Archiver):
             cdn_url = self.storage.get_cdn_url(key)
             timestamp = info.create.isoformat() if hasattr(info, "create") else None
 
-            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+            return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
                                  timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
             logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
-            return ArchiveResult(status=status)
+            return self.generateArchiveResult(status=status)
 
         except:
             error = traceback.format_exc()
             status = 'Other Tiktok error: ' + str(error)
             logger.warning(f'Other Tiktok error' + str(error))
-            return ArchiveResult(status=status)
+            return self.generateArchiveResult(status=status)
@@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver):
             # only s3 storage supports storage.exists as not implemented on gd
             cdn_url = self.storage.get_cdn_url(key)
             screenshot = self.get_screenshot(url)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
 
         urls = []
         if tweet.includes:

@@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)

@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
             screenshot = self.get_screenshot(url)
             wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 

@@ -62,7 +62,7 @@ class TwitterArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010

@@ -87,7 +87,7 @@ class TwitterArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
 
     def choose_variant(self, variants):
         # choosing the highest quality possible

@@ -31,7 +31,7 @@ class VkArchiver(Archiver):
         # if check_if_exists and self.storage.exists(key):
         #     screenshot = self.get_screenshot(url)
         #     cdn_url = self.storage.get_cdn_url(key)
-        #     return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
+        #     return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
 
         results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
         if len(results) == 0:

@@ -71,4 +71,4 @@ class VkArchiver(Archiver):
         # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
@@ -39,7 +39,7 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
             return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
             retries += 1
 
         if status_r.status_code != 200:
-            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
 
         status_json = status_r.json()
         if status_json['status'] != 'success':

@@ -77,7 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
+        self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):

@@ -86,4 +86,4 @@ class WaybackArchiver(Archiver):
             return self.signal_retry_in(**kwargs)
         if "this host has been already captured" in str(json_data).lower():
             return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later
-        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
+        return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)

@@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver):
 
         if info.get('is_live', False):
             logger.warning("Live streaming media, not archiving now")
-            return ArchiveResult(status="Streaming media")
+            return self.generateArchiveResult(status="Streaming media")
 
         if 'twitter.com' in netloc:
             if 'https://twitter.com/' in info['webpage_url']:

@@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver):
         elif 'upload_date' in info and info['upload_date'] is not None:
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
@@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet):
     return missing
 
 
-def should_process_sheet(c, sheet_name):
+def should_process_sheet(c: Config, sheet_name):
     if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
         # ALLOW rules exist AND sheet name not explicitly allowed
         return False

@@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name):
     return True
 
 
+def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
+    url = expand_url(url)
+    c.set_folder(folder)
+    storage = c.get_storage()
+
+    # make a new driver so each spreadsheet row is idempotent
+    c.recreate_webdriver()
+
+    # order matters, first to succeed excludes remaining
+    active_archivers = [
+        TelethonArchiver(storage, c),
+        TiktokArchiver(storage, c),
+        TwitterApiArchiver(storage, c),
+        InstagramArchiver(storage, c),
+        YoutubeDLArchiver(storage, c),
+        TelegramArchiver(storage, c),
+        TwitterArchiver(storage, c),
+        VkArchiver(storage, c),
+        WaybackArchiver(storage, c)
+    ]
+
+    for archiver in active_archivers:
+        logger.debug(f'Trying {archiver} on {debug_string}')
+
+        try:
+            result = archiver.download(url, check_if_exists=c.check_if_exists)
+        except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+        except Exception as e:
+            result = False
+            logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
+
+        if result:
+            success = result.status in ['success', 'already archived']
+            result.status = f"{archiver.name}: {result.status}"
+            if success:
+                logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
+                break
+            # only 1 retry possible for now
+            if is_retry and Archiver.is_retry(result.status):
+                result.status = Archiver.remove_retry(result.status)
+            logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
+
+    return result
+
+
 def process_sheet(c: Config):
     sh = c.gsheets_client.open(c.sheet)
 
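This archive_url helper is the per-URL loop lifted out of process_sheet (removed in the next hunk) so the spreadsheet flow and the new src/cli.py can share it. A minimal direct call, mirroring what cli.py does later in this diff (the URL is a placeholder):

    result = archive_url(c, "https://example.com/post", "", "url=...", is_retry=False)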
@@ -100,46 +144,7 @@ def process_sheet(c: Config):
             # All checks done - archival process starts here
             try:
                 gw.set_cell(row, 'status', 'Archive in progress')
-                url = expand_url(url)
-                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
-
-                # make a new driver so each spreadsheet row is idempotent
-                c.recreate_webdriver()
-
-                # order matters, first to succeed excludes remaining
-                active_archivers = [
-                    TelethonArchiver(storage, c),
-                    TiktokArchiver(storage, c),
-                    TwitterApiArchiver(storage, c),
-                    InstagramArchiver(storage, c),
-                    YoutubeDLArchiver(storage, c),
-                    TelegramArchiver(storage, c),
-                    TwitterArchiver(storage, c),
-                    VkArchiver(storage, c),
-                    WaybackArchiver(storage, c)
-                ]
-
-                for archiver in active_archivers:
-                    logger.debug(f'Trying {archiver} on {row=}')
-
-                    try:
-                        result = archiver.download(url, check_if_exists=c.check_if_exists)
-                    except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
-                    except Exception as e:
-                        result = False
-                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
-                    if result:
-                        success = result.status in ['success', 'already archived']
-                        result.status = f"{archiver.name}: {result.status}"
-                        if success:
-                            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
-                            break
-                        # only 1 retry possible for now
-                        if is_retry and Archiver.is_retry(result.status):
-                            result.status = Archiver.remove_retry(result.status)
-                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
-
+                result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
                 if result:
                     update_sheet(gw, row, url, result)
                 else:
src/cli.py (new file)
@@ -0,0 +1,30 @@
+import tempfile, json
+import auto_archive
+from loguru import logger
+from configs import Config
+from storages import Storage
+from slugify import slugify
+
+
+def main():
+    c = Config()
+    c.parse()
+    url = c.url
+    if not url:
+        logger.error("Invalid URL: '{url}'")
+        return
+    logger.info(f'Archiving "{url=}".')
+    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
+        Storage.TMP_FOLDER = tmpdir
+        result = auto_archive.archive_url(c, url, "", f"{url=}", False)
+        c.destroy_webdriver()
+        key = f"media_{slugify(url)}.json"
+        with open(key, "w", encoding="utf-8") as outf:
+            json.dump(result.media, outf, ensure_ascii=False, indent=4)
+        c.get_storage().upload(key, key)
+        print(result)
+        return result
+
+
+if __name__ == "__main__":
+    main()
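A likely way to exercise this new single-URL mode (cli.py imports auto_archive, configs and storages as top-level modules, so it is presumably run from inside src/ or through the Docker entrypoint): python cli.py --url=https://example.com/post --config=config.yaml. It archives that one URL, writes the collected media list to media_<slugified-url>.json and uploads that JSON through the configured storage; the example URL is a placeholder.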
@@ -47,6 +47,8 @@ class Config:
         with open(self.config_file, "r", encoding="utf-8") as inf:
             self.config = yaml.safe_load(inf)
 
+        self.url = getattr_or(self.args, "url", '')
+
         # ----------------------EXECUTION - execution configurations
         execution = self.config.get("execution", {})
 

@@ -211,6 +213,7 @@ class Config:
         """
         parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
 
+        parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
         parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
         parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
         parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
@@ -1,3 +1,4 @@
+import os, uuid
 from loguru import logger
 from abc import ABC, abstractmethod
 from pathlib import Path

@@ -18,6 +19,14 @@ class Storage(ABC):
     @abstractmethod
     def uploadf(self, file, key, **kwargs): pass
 
+    def clean_key(self, key):
+        # Some storages does not work well with trailing forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
+
     def upload(self, filename: str, key: str, **kwargs):
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         with open(filename, 'rb') as f:
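clean_key only strips a single leading slash so keys join cleanly onto folder paths; a quick sketch, assuming storage is any concrete Storage instance and the keys are invented:

    storage.clean_key("/folder/img.jpg")   # -> "folder/img.jpg"
    storage.clean_key("folder/img.jpg")    # -> "folder/img.jpg" (unchanged)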
@@ -116,13 +116,6 @@ class GDStorage(Storage):
         # GD only requires the filename not a file reader
         self.uploadf(filename, key, **kwargs)
 
-    def clean_key(self, key):
-        # GDrive does not work well with trailing forward slashes and some keys come with that
-        if key.startswith('/'):
-            logger.debug(f'Found and fixed a leading "/" for {key=}')
-            return key[1:]
-        return key
-
     # gets the Drive folderID if it is there
     def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
         """

@@ -1,6 +1,7 @@
 import os
 
 from dataclasses import dataclass
+from loguru import logger
 
 from .base_storage import Storage
 from utils import mkdir_if_not_exists

@@ -18,8 +19,12 @@ class LocalStorage(Storage):
         mkdir_if_not_exists(self.save_to)
 
     def get_cdn_url(self, key):
+        key = self.clean_key(key)
+        logger.info(f"{key=}")
         full_path = os.path.join(self.save_to, self.folder, key)
-        mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
+        logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
+        os.makedirs(os.path.dirname(full_path), exist_ok=True)
+        # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
         return os.path.abspath(full_path)
 
     def exists(self, key):
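The os.makedirs call replaces the manual path-splitting mkdir_if_not_exists line with the standard-library equivalent; a minimal sketch with an invented path:

    import os
    full_path = "local_archive/some-folder/sub/key.jpg"
    os.makedirs(os.path.dirname(full_path), exist_ok=True)  # creates local_archive/some-folder/sub if missing, no error if it already exists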