Merge pull request #64 from bellingcat/dev

pull/80/head
Miguel Sozinho Ramalho 2022-10-17 14:40:15 +01:00 committed by GitHub
commit 683f2d7500
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 175 additions and 74 deletions

.gitignore vendored
View file

@@ -19,4 +19,6 @@ local_archive/
 vk_config*.json
 gd-token.json
 credentials.json
 secrets/*
+browsertrix/*
+browsertrix-tmp/*

View file

@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
+in addition to screenshots you will need to install [Docker](https://www.docker.com/).

 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
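A quick way to confirm the new Docker prerequisite before enabling WACZ snapshots (this snippet is illustrative and not part of the repository; it assumes Docker is already on your PATH):

```python
import subprocess

# Verify Docker is installed, then pre-pull the crawler image used for WACZ snapshots.
subprocess.run(["docker", "--version"], check=True)
subprocess.run(["docker", "pull", "webrecorder/browsertrix-crawler"], check=True)
```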

View file

@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
 from slugify import slugify

+from configs import Config
 from storages import Storage
 from utils import mkdir_if_not_exists
@@ -24,16 +25,18 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None


 class Archiver(ABC):
-    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"

-    def __init__(self, storage: Storage, driver):
+    def __init__(self, storage: Storage, config: Config):
         self.storage = storage
-        self.driver = driver
+        self.driver = config.webdriver
+        self.hash_algorithm = config.hash_algorithm
+        self.browsertrix = config.browsertrix_config

     def __str__(self):
         return self.__class__.__name__
@@ -162,11 +165,11 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read() # read entire file as bytes

-        logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
-        if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
-        elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
-        else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+        logger.debug(f'Hash algorithm is {self.hash_algorithm}')
+        if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
+        elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
+        else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
         return hash.hexdigest()
@@ -195,8 +198,54 @@ class Archiver(ABC):
             logger.info("TimeoutException loading page for screenshot")

         self.driver.save_screenshot(filename)
+        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
+        return self.storage.get_cdn_url(key)
+
+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
+
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
+        cmd = [
+            "docker", "run",
+            "-v", f"{browsertrix_home}:/crawls/",
+            "-it",
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", str(self.browsertrix.timeout_seconds)
+        ]
+
+        if not os.path.isdir(browsertrix_home):
+            os.mkdir(browsertrix_home)
+
+        if self.browsertrix.profile:
+            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
+            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+
+        try:
+            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"WACZ generation failed: {e}")
+            return
+
+        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
         self.storage.upload(filename, key, extra_args={
-            'ACL': 'public-read', 'ContentType': 'image/png'})
+            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # clean up the local browsertrix files
+        try:
+            shutil.rmtree(browsertrix_home)
+        except PermissionError:
+            logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
+
         return self.storage.get_cdn_url(key)
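With this change every archiver can populate the new wacz field on ArchiveResult. A minimal sketch of how calling code might consume it; the archiver instance and URL below are placeholders and assume a Config whose execution section has a browsertrix entry:

```python
# Illustrative usage only; `archiver` stands for any configured Archiver subclass.
result = archiver.download("https://t.me/example_channel/123", check_if_exists=True)
if result is not None and result.wacz:
    print("WACZ snapshot uploaded to:", result.wacz)  # CDN URL of the .wacz file
```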

View file

@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
             url += "?embed=1"

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None

-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
+            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)

         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]
@@ -85,4 +86,4 @@ class TelegramArchiver(Archiver):
         cdn_url = self.storage.get_cdn_url(key)

         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
-                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
+                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)

View file

@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import TelethonConfig
+from configs import Config
 from utils import getattr_or
@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
     name = "telethon"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")

-    def __init__(self, storage: Storage, driver, config: TelethonConfig):
-        super().__init__(storage, driver)
-        if config:
-            self.client = TelegramClient("./anon", config.api_id, config.api_hash)
-            self.bot_token = config.bot_token
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        if config.telegram_config:
+            c = config.telegram_config
+            self.client = TelegramClient("./anon", c.api_id, c.api_hash)
+            self.bot_token = c.bot_token

     def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
         """
@@ -73,6 +74,7 @@ class TelethonArchiver(Archiver):
         logger.debug(f'got {len(media_posts)=} for {url=}')

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

         if len(media_posts) > 0:
             key = self.get_html_key(url)
@@ -80,7 +82,7 @@ class TelethonArchiver(Archiver):
             if check_if_exists and self.storage.exists(key):
                 # only s3 storage supports storage.exists as not implemented on gd
                 cdn_url = self.storage.get_cdn_url(key)
-                return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
+                return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)

             key_thumb, thumb_index = None, None
             group_id = post.grouped_id if post.grouped_id is not None else post.id
@@ -119,7 +121,7 @@ class TelethonArchiver(Archiver):
             page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))

-            return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
+            return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)

         page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
-        return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
+        return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)

View file

@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
             hash = self.get_hash(filename)

             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)

             try: os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):

             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'

View file

@@ -5,7 +5,7 @@ from loguru import logger
 from pytwitter import Api

 from storages.base_storage import Storage
-from configs import TwitterApiConfig
+from configs import Config
 from .base_archiver import ArchiveResult
 from .twitter_archiver import TwitterArchiver
@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
     name = "twitter_api"

-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
-        super().__init__(storage, driver)
-        if config.bearer_token:
-            self.api = Api(bearer_token=config.bearer_token)
-        elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        c = config.twitter_config
+        if c.bearer_token:
+            self.api = Api(bearer_token=c.bearer_token)
+        elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
             self.api = Api(
-                consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
+                consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)

     def download(self, url, check_if_exists=False):
         if not hasattr(self, "api"):
@@ -69,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver):
         }, ensure_ascii=False, indent=4)

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)

View file

@@ -39,8 +39,9 @@
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)

         urls = []
@@ -59,8 +60,9 @@
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)

     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010
@@ -83,8 +85,9 @@
         timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)

     def choose_variant(self, variants):
         # choosing the highest quality possible

View file

@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import VkConfig
+from configs import Config


 class VkArchiver(Archiver):
@@ -17,10 +17,10 @@
     wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
     photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")

-    def __init__(self, storage: Storage, driver, config: VkConfig):
-        super().__init__(storage, driver)
-        if config != None:
-            self.vks = VkScraper(config.username, config.password)
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        if config.vk_config != None:
+            self.vks = VkScraper(config.vk_config.username, config.vk_config.password)

     def download(self, url, check_if_exists=False):
         if not hasattr(self, "vks") or self.vks is None:
@@ -70,4 +70,5 @@
         page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
         # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
+        wacz = self.get_wacz(url)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)

View file

@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup

 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import WaybackConfig
+from configs import Config


 class WaybackArchiver(Archiver):
@@ -15,9 +15,9 @@
     """
     name = "wayback"

-    def __init__(self, storage: Storage, driver, config: WaybackConfig):
-        super(WaybackArchiver, self).__init__(storage, driver)
-        self.config = config
+    def __init__(self, storage: Storage, config: Config):
+        super(WaybackArchiver, self).__init__(storage, config)
+        self.config = config.wayback_config
         self.seen_urls = {}

     def download(self, url, check_if_exists=False):
@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
             if url in self.seen_urls: return self.seen_urls[url]

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)

         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

         job_id = r.json()['job_id']
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -59,11 +61,11 @@
             retries += 1

         if status_r.status_code != 200:
-            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
+            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)

         status_json = status_r.json()
         if status_json['status'] != 'success':
-            return self.custom_retry(status_json, screenshot=screenshot)
+            return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)

         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
@@ -75,8 +77,7 @@
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]

     def custom_retry(self, json_data, **kwargs):

View file

@@ -6,15 +6,16 @@ from loguru import logger

 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
+from configs import Config


 class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

-    def __init__(self, storage: Storage, driver, fb_cookie):
-        super().__init__(storage, driver)
-        self.fb_cookie = fb_cookie
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        self.fb_cookie = config.facebook_cookie

     def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
@@ -93,6 +94,7 @@ class YoutubeDLArchiver(Archiver):
         hash = self.get_hash(filename)

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

         # get duration
         duration = info.get('duration')
@@ -113,4 +115,4 @@
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

View file

@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile

 from loguru import logger
 from slugify import slugify
+from urllib.parse import quote

 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
@@ -11,7 +12,7 @@ from storages import Storage
 random.seed()


-def update_sheet(gw, row, result: ArchiveResult):
+def update_sheet(gw, row, url, result: ArchiveResult):
     cell_updates = []
     row_values = gw.get_row(row)
@@ -30,6 +31,8 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)
+    batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')

     if result.timestamp is not None:
         if type(result.timestamp) == int:
@@ -104,14 +107,14 @@ def process_sheet(c: Config):
         # order matters, first to succeed excludes remaining
         active_archivers = [
-            TelethonArchiver(storage, c.webdriver, c.telegram_config),
-            TiktokArchiver(storage, c.webdriver),
-            TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
-            YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
-            TelegramArchiver(storage, c.webdriver),
-            TwitterArchiver(storage, c.webdriver),
-            VkArchiver(storage, c.webdriver, c.vk_config),
-            WaybackArchiver(storage, c.webdriver, c.wayback_config)
+            TelethonArchiver(storage, c),
+            TiktokArchiver(storage, c),
+            TwitterApiArchiver(storage, c),
+            YoutubeDLArchiver(storage, c),
+            TelegramArchiver(storage, c),
+            TwitterArchiver(storage, c),
+            VkArchiver(storage, c),
+            WaybackArchiver(storage, c)
         ]

         for archiver in active_archivers:
@@ -136,7 +139,7 @@ def process_sheet(c: Config):
                         logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

                 if result:
-                    update_sheet(gw, row, result)
+                    update_sheet(gw, row, url, result)
                 else:
                     gw.set_cell(row, 'status', 'failed: no archiver')
             except KeyboardInterrupt:
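For reference, the replaywebpage column that update_sheet now fills is simply the hosted ReplayWeb.page viewer pointed at the uploaded WACZ. A small sketch of the URL construction with made-up placeholder values:

```python
from urllib.parse import quote

# Placeholder values for illustration only.
wacz_cdn_url = "https://cdn.example.com/archives/example.wacz"
original_url = "https://t.me/example_channel/123"

replay_link = f"https://replayweb.page/?source={quote(wacz_cdn_url)}#view=pages&url={quote(original_url)}"
# -> https://replayweb.page/?source=https%3A//cdn.example.com/archives/example.wacz#view=pages&url=https%3A//t.me/example_channel/123
```

Because ReplayWeb.page fetches the WACZ directly in the browser, the example config reminds you to allow CORS on the storage bucket for these links to work.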

View file

@@ -0,0 +1,6 @@
+from dataclasses import dataclass
+
+@dataclass
+class BrowsertrixConfig:
+    profile: str
+    timeout_seconds: str

View file

@@ -1,6 +1,4 @@
-import argparse, yaml, json
-from archivers.base_archiver import Archiver
-
+import argparse, yaml, json, os
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -13,6 +11,7 @@ from .telethon_config import TelethonConfig
 from .selenium_config import SeleniumConfig
 from .vk_config import VkConfig
 from .twitter_api_config import TwitterApiConfig
+from .browsertrix_config import BrowsertrixConfig

 from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
@@ -82,7 +81,16 @@
         )
         self.webdriver = "not initialized"

-        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+        # browsertrix config
+        browsertrix_configs = execution.get("browsertrix", {})
+        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
+            browsertrix_profile = os.path.abspath(browsertrix_profile)
+        self.browsertrix_config = BrowsertrixConfig(
+            profile=browsertrix_profile,
+            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
+        )
+
+        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")

         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -208,6 +216,7 @@
         update the folder in each of the storages
         """
         self.folder = folder
+        logger.info(f"setting folder to {folder}")
         # s3
         if hasattr(self, "s3_config"): self.s3_config.folder = folder
         if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
@@ -263,7 +272,8 @@
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
-            "hash_algorithm": Archiver.HASH_ALGORITHM,
+            "hash_algorithm": self.hash_algorithm,
+            "browsertrix_config": asdict(self.browsertrix_config),
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,

View file

@@ -9,6 +9,7 @@ secrets:
     secret: "s3 API secret"
     # use region format like such
     endpoint_url: "https://{region}.digitaloceanspaces.com"
+    # endpoint_url: "https://s3.{region}.amazonaws.com"
     #use bucket, region, and key (key is the archived file path generated when executing) format like such as:
     cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
     # if private:true S3 urls will not be readable online
@@ -101,6 +102,11 @@ execution:
     timeout_seconds: 120
     window_width: 1400
     window_height: 2000
+  # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
+  browsertrix:
+    profile: "./browsertrix/crawls/profile.tar.gz"
+    timeout_seconds: 90 # defaults to 90s
+
   # puts execution logs into /logs folder, defaults to false
   save_logs: true
   # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
@@ -119,4 +125,7 @@ execution:
     duration: duration
     screenshot: screenshot
     hash: hash
+    wacz: wacz
+    # if you want the replaypage to work, make sure to allow CORS on your bucket
+    replaywebpage: replaywebpage

View file

@@ -67,9 +67,14 @@
         return False

     def uploadf(self, file, key, **kwargs):
-        if self.private:
-            extra_args = kwargs.get("extra_args", {})
-        else:
-            extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        extra_args = kwargs.get("extra_args", {})
+        if not self.private and 'ACL' not in extra_args:
+            extra_args['ACL'] = 'public-read'
+
+        if 'ContentType' not in extra_args:
+            try:
+                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+            except Exception as e:
+                logger.error(f"Unable to get mimetype for {key=}, error: {e}")
+
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View file

@@ -20,7 +20,9 @@ class GWorksheet:
         'title': 'upload title',
         'duration': 'duration',
         'screenshot': 'screenshot',
-        'hash': 'hash'
+        'hash': 'hash',
+        'wacz': 'wacz',
+        'replaywebpage': 'replaywebpage',
     }

     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):