diff --git a/.gitignore b/.gitignore
index 8da75c3..4d19b9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,6 @@ local_archive/
 vk_config*.json
 gd-token.json
 credentials.json
-secrets/*
\ No newline at end of file
+secrets/*
+browsertrix/*
+browsertrix-tmp/*
\ No newline at end of file
diff --git a/README.md b/README.md
index ca5e06a..9e77d19 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots with [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
+   in addition to screenshots, you will need to install [Docker](https://www.docker.com/).
 
 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 902f626..82d705a 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
 from slugify import slugify
 
+from configs import Config
 from storages import Storage
 from utils import mkdir_if_not_exists
 
@@ -24,16 +25,18 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None
 
 
 class Archiver(ABC):
-    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"
 
-    def __init__(self, storage: Storage, driver):
+    def __init__(self, storage: Storage, config: Config):
         self.storage = storage
-        self.driver = driver
+        self.driver = config.webdriver
+        self.hash_algorithm = config.hash_algorithm
+        self.browsertrix = config.browsertrix_config
 
     def __str__(self):
         return self.__class__.__name__
@@ -162,11 +165,11 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read()  # read entire file as bytes
-            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            logger.debug(f'Hash algorithm is {self.hash_algorithm}')
 
-            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
-            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
-            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+            if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
 
         return hash.hexdigest()
 
@@ -195,8 +198,54 @@ class Archiver(ABC):
             logger.info("TimeoutException loading page for screenshot")
 
         self.driver.save_screenshot(filename)
+        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
+
+        return self.storage.get_cdn_url(key)
+
+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
+
+        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
+        cmd = [
+            "docker", "run",
+            "-v", f"{browsertrix_home}:/crawls/",
+            "-it",
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", str(self.browsertrix.timeout_seconds)
+        ]
+
+        if not os.path.isdir(browsertrix_home):
+            os.mkdir(browsertrix_home)
+
+        if self.browsertrix.profile:
+            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
+            cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+
+        try:
+            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"WACZ generation failed: {e}")
+            return
+
+        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
         self.storage.upload(filename, key, extra_args={
-                            'ACL': 'public-read', 'ContentType': 'image/png'})
+                            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # clean up the local browsertrix files
+        try:
+            shutil.rmtree(browsertrix_home)
+        except PermissionError:
+            logger.warning(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
 
         return self.storage.get_cdn_url(key)
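Reviewer note on `get_wacz()` above: the crawl is easy to reproduce outside the archiver when debugging a profile or a timeout. Below is a minimal standalone sketch of the same docker invocation the method assembles; the function name `crawl_to_wacz` and its defaults are illustrative only, not part of this diff.

```python
# Minimal sketch of the docker run that get_wacz() assembles (illustrative, not part of the PR).
import os, re, shutil, subprocess

def crawl_to_wacz(url, timeout_seconds="90", profile=None):
    # bind-mounted into the container as /crawls/, same as the diff
    home = os.path.join(os.getcwd(), "browsertrix-tmp")
    os.makedirs(home, exist_ok=True)
    # same sanitisation as the diff, applied directly to the url here for brevity
    collection = re.sub('[^0-9a-zA-Z]+', '', url)
    cmd = ["docker", "run", "-v", f"{home}:/crawls/", "-it",
           "webrecorder/browsertrix-crawler", "crawl",
           "--url", url, "--scopeType", "page", "--generateWACZ", "--text",
           "--collection", collection, "--behaviorTimeout", str(timeout_seconds)]
    if profile:  # path to a profile.tar.gz created beforehand, copied into the mounted folder
        shutil.copyfile(profile, os.path.join(home, "profile.tar.gz"))
        cmd.extend(["--profile", "/crawls/profile.tar.gz"])
    subprocess.run(cmd, check=True)  # raises CalledProcessError on a non-zero exit
    return os.path.join(home, "collections", collection, f"{collection}.wacz")
```

One behaviour worth flagging: `-it` allocates a pseudo-TTY, so the crawl will abort with docker's "the input device is not a TTY" error when the archiver runs without a terminal attached (e.g. under cron); dropping `-t` may be safer for unattended runs.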
re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig): - super().__init__(storage, driver) - if config: - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + if config.telegram_config: + c = config.telegram_config + self.client = TelegramClient("./anon", c.api_id, c.api_hash) + self.bot_token = c.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ @@ -73,6 +74,7 @@ class TelethonArchiver(Archiver): logger.debug(f'got {len(media_posts)=} for {url=}') screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) if len(media_posts) > 0: key = self.get_html_key(url) @@ -80,7 +82,7 @@ class TelethonArchiver(Archiver): if check_if_exists and self.storage.exists(key): # only s3 storage supports storage.exists as not implemented on gd cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) + return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) key_thumb, thumb_index = None, None group_id = post.grouped_id if post.grouped_id is not None else post.id @@ -119,7 +121,7 @@ class TelethonArchiver(Archiver): page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index) + return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 8100bb1..bdaad52 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -48,6 +48,7 @@ class TiktokArchiver(Archiver): hash = self.get_hash(filename) screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) try: os.remove(filename) except FileNotFoundError: @@ -57,7 +58,7 @@ class TiktokArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), - timestamp=timestamp, hash=hash, screenshot=screenshot) + timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) except tiktok_downloader.Except.InvalidUrl as e: status = 'Invalid URL' diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 6aa1742..454cfe2 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -5,7 +5,7 @@ from loguru import logger from pytwitter import Api from storages.base_storage import Storage -from configs import TwitterApiConfig +from configs import Config from .base_archiver import ArchiveResult from 
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 8100bb1..bdaad52 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
 
             hash = self.get_hash(filename)
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
 
             try:
                 os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):
 
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py
index 6aa1742..454cfe2 100644
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@@ -5,7 +5,7 @@ from loguru import logger
 from pytwitter import Api
 
 from storages.base_storage import Storage
-from configs import TwitterApiConfig
+from configs import Config
 from .base_archiver import ArchiveResult
 from .twitter_archiver import TwitterArchiver
 
@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
 class TwitterApiArchiver(TwitterArchiver):
     name = "twitter_api"
 
-    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
-        super().__init__(storage, driver)
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        c = config.twitter_config
 
-        if config.bearer_token:
-            self.api = Api(bearer_token=config.bearer_token)
-        elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
+        if c.bearer_token:
+            self.api = Api(bearer_token=c.bearer_token)
+        elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
             self.api = Api(
-                consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
+                consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)
 
     def download(self, url, check_if_exists=False):
         if not hasattr(self, "api"):
@@ -69,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver):
         }, ensure_ascii=False, indent=4)
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 8f646fd..b868af5 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 
@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
 
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010
@@ -83,8 +85,9 @@ class TwitterArchiver(Archiver):
         timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
 
     def choose_variant(self, variants):
         # choosing the highest quality possible
diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py
index c448367..91b8354 100644
--- a/archivers/vk_archiver.py
+++ b/archivers/vk_archiver.py
@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import VkConfig
+from configs import Config
 
 
 class VkArchiver(Archiver):
@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
     wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
     photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
 
-    def __init__(self, storage: Storage, driver, config: VkConfig):
-        super().__init__(storage, driver)
-        if config != None:
-            self.vks = VkScraper(config.username, config.password)
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        if config.vk_config != None:
+            self.vks = VkScraper(config.vk_config.username, config.vk_config.password)
 
     def download(self, url, check_if_exists=False):
         if not hasattr(self, "vks") or self.vks is None:
@@ -70,4 +70,5 @@ class VkArchiver(Archiver):
         page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
         # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
+        wacz = self.get_wacz(url)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index f46d1cb..e0ede90 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
-from configs import WaybackConfig
+from configs import Config
 
 
 class WaybackArchiver(Archiver):
@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
     """
     name = "wayback"
 
-    def __init__(self, storage: Storage, driver, config: WaybackConfig):
-        super(WaybackArchiver, self).__init__(storage, driver)
-        self.config = config
+    def __init__(self, storage: Storage, config: Config):
+        super(WaybackArchiver, self).__init__(storage, config)
+        self.config = config.wayback_config
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
         if url in self.seen_urls: return self.seen_urls[url]
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
 
         job_id = r.json()['job_id']
 
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -59,11 +61,11 @@ class WaybackArchiver(Archiver):
             retries += 1
 
         if status_r.status_code != 200:
-            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
+            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
 
         status_json = status_r.json()
         if status_json['status'] != 'success':
-            return self.custom_retry(status_json, screenshot=screenshot)
+            return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)
 
         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
 
@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
             title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 7990131..5d09442 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -6,15 +6,16 @@ from loguru import logger
 
 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
+from configs import Config
 
 
 class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
 
-    def __init__(self, storage: Storage, driver, fb_cookie):
-        super().__init__(storage, driver)
-        self.fb_cookie = fb_cookie
+    def __init__(self, storage: Storage, config: Config):
+        super().__init__(storage, config)
+        self.fb_cookie = config.facebook_cookie
 
     def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
@@ -93,6 +94,7 @@ class YoutubeDLArchiver(Archiver):
             hash = self.get_hash(filename)
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         # get duration
         duration = info.get('duration')
@@ -113,4 +115,4 @@ class YoutubeDLArchiver(Archiver):
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/auto_archive.py b/auto_archive.py
index f12b9c4..d657061 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile
 
 from loguru import logger
 from slugify import slugify
+from urllib.parse import quote
 
 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
@@ -11,7 +12,7 @@ from storages import Storage
 random.seed()
 
 
-def update_sheet(gw, row, result: ArchiveResult):
+def update_sheet(gw, row, url, result: ArchiveResult):
     cell_updates = []
     row_values = gw.get_row(row)
 
@@ -30,6 +31,9 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)
+    if result.wacz is not None:
+        batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
 
     if result.timestamp is not None:
         if type(result.timestamp) == int:
@@ -104,14 +107,14 @@ def process_sheet(c: Config):
 
             # order matters, first to succeed excludes remaining
             active_archivers = [
-                TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                TiktokArchiver(storage, c.webdriver),
-                TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
-                YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
-                TelegramArchiver(storage, c.webdriver),
-                TwitterArchiver(storage, c.webdriver),
-                VkArchiver(storage, c.webdriver, c.vk_config),
-                WaybackArchiver(storage, c.webdriver, c.wayback_config)
+                TelethonArchiver(storage, c),
+                TiktokArchiver(storage, c),
+                TwitterApiArchiver(storage, c),
+                YoutubeDLArchiver(storage, c),
+                TelegramArchiver(storage, c),
+                TwitterArchiver(storage, c),
+                VkArchiver(storage, c),
+                WaybackArchiver(storage, c)
             ]
 
             for archiver in active_archivers:
@@ -136,7 +139,7 @@ def process_sheet(c: Config):
                         logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
 
             if result:
-                update_sheet(gw, row, result)
+                update_sheet(gw, row, url, result)
             else:
                 gw.set_cell(row, 'status', 'failed: no archiver')
     except KeyboardInterrupt:
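Reviewer note on the new `replaywebpage` column: the cell holds a deep link into replayweb.page, which downloads the WACZ client-side and jumps to the archived page. The guard added above matters because `get_wacz()` returns `None` on failure and `urllib.parse.quote(None)` raises a `TypeError`. A sketch with illustrative values:

```python
# Illustrative values; result.wacz and url are the real inputs in update_sheet().
from urllib.parse import quote

wacz = "https://cdn.example.com/folder/archive.wacz"
url = "https://t.me/some_channel/123"
link = f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(url)}"
print(link)
# https://replayweb.page/?source=https%3A//cdn.example.com/folder/archive.wacz#view=pages&url=https%3A//t.me/some_channel/123
```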
diff --git a/configs/browsertrix_config.py b/configs/browsertrix_config.py
new file mode 100644
index 0000000..1039da3
--- /dev/null
+++ b/configs/browsertrix_config.py
@@ -0,0 +1,6 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class BrowsertrixConfig:
+    profile: str
+    timeout_seconds: str
diff --git a/configs/config.py b/configs/config.py
index 0d11467..beff612 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -1,6 +1,4 @@
-
-import argparse, yaml, json
-from archivers.base_archiver import Archiver
+import argparse, yaml, json, os
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -13,6 +11,7 @@ from .telethon_config import TelethonConfig
 from .selenium_config import SeleniumConfig
 from .vk_config import VkConfig
 from .twitter_api_config import TwitterApiConfig
+from .browsertrix_config import BrowsertrixConfig
 from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
 
 
@@ -82,7 +81,16 @@ class Config:
         )
         self.webdriver = "not initialized"
 
-        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+        # browsertrix config
+        browsertrix_configs = execution.get("browsertrix", {})
+        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
+            browsertrix_profile = os.path.abspath(browsertrix_profile)
+        self.browsertrix_config = BrowsertrixConfig(
+            profile=browsertrix_profile,
+            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
+        )
+
+        self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")
 
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -208,6 +216,7 @@ class Config:
         update the folder in each of the storages
         """
         self.folder = folder
+        logger.info(f"setting folder to {folder}")
         # s3
         if hasattr(self, "s3_config"): self.s3_config.folder = folder
         if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
@@ -263,7 +272,8 @@ class Config:
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
-            "hash_algorithm": Archiver.HASH_ALGORITHM,
+            "hash_algorithm": self.hash_algorithm,
+            "browsertrix_config": asdict(self.browsertrix_config),
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
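Reviewer note on the parsing above: thanks to the walrus-guarded `abspath`, a missing profile stays an empty string, which `get_wacz` later treats as "no profile" via its truthiness check. Condensed, with an inline dict standing in for the loaded yaml (the re-export of `BrowsertrixConfig` from the `configs` package is assumed):

```python
import os
from configs import BrowsertrixConfig  # assumes configs/__init__.py re-exports it

execution = {"browsertrix": {"profile": "./browsertrix/crawls/profile.tar.gz"}}  # stand-in for yaml
browsertrix_configs = execution.get("browsertrix", {})
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
    browsertrix_profile = os.path.abspath(browsertrix_profile)  # docker volume mounts need absolute paths
config = BrowsertrixConfig(profile=browsertrix_profile,
                           timeout_seconds=browsertrix_configs.get("timeout_seconds", "90"))
```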
"hash_algorithm": Archiver.HASH_ALGORITHM, + "hash_algorithm": self.hash_algorithm, + "browsertrix_config": asdict(self.browsertrix_config), "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index acbe52c..a8138af 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -9,6 +9,7 @@ secrets: secret: "s3 API secret" # use region format like such endpoint_url: "https://{region}.digitaloceanspaces.com" + # endpoint_url: "https://s3.{region}.amazonaws.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -101,6 +102,11 @@ execution: timeout_seconds: 120 window_width: 1400 window_height: 2000 + + # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) + browsertrix: + profile: "./browsertrix/crawls/profile.tar.gz" + timeout_seconds: 90 # defaults to 90s # puts execution logs into /logs folder, defaults to false save_logs: true # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" @@ -119,4 +125,7 @@ execution: duration: duration screenshot: screenshot hash: hash + wacz: wacz + # if you want the replaypage to work, make sure to allow CORS on your bucket + replaywebpage: replaywebpage diff --git a/storages/s3_storage.py b/storages/s3_storage.py index b124aae..563d2ea 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -67,9 +67,14 @@ class S3Storage(Storage): return False def uploadf(self, file, key, **kwargs): - if self.private: - extra_args = kwargs.get("extra_args", {}) - else: - extra_args = kwargs.get("extra_args", {'ACL': 'public-read'}) - extra_args['ContentType'] = mimetypes.guess_type(key)[0] + extra_args = kwargs.get("extra_args", {}) + if not self.private and 'ACL' not in extra_args: + extra_args['ACL'] = 'public-read' + + if 'ContentType' not in extra_args: + try: + extra_args['ContentType'] = mimetypes.guess_type(key)[0] + except Exception as e: + logger.error(f"Unable to get mimetype for {key=}, error: {e}") + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index 0e05ab6..8fe640e 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -20,7 +20,9 @@ class GWorksheet: 'title': 'upload title', 'duration': 'duration', 'screenshot': 'screenshot', - 'hash': 'hash' + 'hash': 'hash', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', } def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):