diff --git a/README.md b/README.md
index ca5e06a..0b79cbb 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots with [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
+   in addition to screenshots, you will need to install Docker.
 
 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 902f626..91cc25a 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -24,6 +24,7 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None
 
 class Archiver(ABC):
@@ -200,6 +201,41 @@ class Archiver(ABC):
 
         return self.storage.get_cdn_url(key)
 
+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = key.replace(".wacz", "").replace("-", "")
+
+        cwd = os.getcwd()
+        cmd = [
+            "docker", "run",
+            "-v", f"{cwd}/browsertrix:/crawls/",
+            # "-it" is omitted: docker fails with "the input device is not a TTY" when run non-interactively
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", "90"
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"wacz generation failed: {e}")
+            return
+
+        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+
+        self.storage.upload(filename, key, extra_args={
+            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # TODO: remove wacz collection, waiting for resolution on:
+        # https://github.com/webrecorder/browsertrix-crawler/issues/170
+
+        return self.storage.get_cdn_url(key)
+
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
         key_folder = key.split('.')[0] + os.path.sep
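A note on the `get_wacz()` hunk above: it shells out to the browsertrix-crawler Docker image, waits for the crawl to complete, uploads the resulting `.wacz` to storage, and returns its CDN URL; on any failure it logs and implicitly returns `None`. The sketch below restates that flow as a self-contained function and adds one guard the diff does not have (verifying the WACZ file actually exists before using it, since a crawl could conceivably exit 0 without producing one). The function name, the `crawls_dir` parameter, and the trimmed-down flag set are illustrative only, not part of the diff:

```python
import os
import subprocess
from typing import Optional

def crawl_page_to_wacz(url: str, collection: str, crawls_dir: str = "browsertrix") -> Optional[str]:
    # Mirror of get_wacz(): run browsertrix-crawler in Docker against a
    # single page, then return the local path of the generated WACZ.
    cmd = [
        "docker", "run",
        "-v", f"{os.path.abspath(crawls_dir)}:/crawls/",  # crawler writes its collections here
        "webrecorder/browsertrix-crawler", "crawl",
        "--url", url,
        "--scopeType", "page",   # capture only this page, do not crawl the whole site
        "--generateWACZ",        # package the crawl into a .wacz archive
        "--collection", collection,
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None  # crawl failed, or docker itself is not installed

    wacz = os.path.join(crawls_dir, "collections", collection, f"{collection}.wacz")
    # Guard that get_wacz() currently lacks: a zero exit code does not
    # strictly guarantee the archive landed where we expect it.
    return wacz if os.path.exists(wacz) else None
```

The extra `--text`, `--behaviors`, and `--behaviorTimeout` flags in the real hunk additionally extract page text and let autoscroll/autoplay behaviors run (for up to 90 seconds) before capture.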
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 0b6e777..d98f761 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
             url += "?embed=1"
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
+            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
 
         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 8100bb1..bdaad52 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
 
             hash = self.get_hash(filename)
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
 
             try: os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):
 
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 8f646fd..81f20ab 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 
@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
 
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010
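The three archivers above (and the two below) all follow the same two-line pattern: call `self.get_wacz(url)` next to the existing `get_screenshot` call and thread the result into `ArchiveResult`. Because the new `wacz` field defaults to `None`, untouched code paths keep constructing valid results. A small illustration (the URLs are made up):

```python
import datetime
from archivers.base_archiver import ArchiveResult

# wacz is just another optional CDN URL on the result, so downstream
# code like update_sheet() can treat it exactly like screenshot or hash.
result = ArchiveResult(
    status="success",
    cdn_url="https://example-cdn.net/archives/page.html",
    screenshot="https://example-cdn.net/archives/page.png",
    wacz="https://example-cdn.net/archives/page.wacz",
    timestamp=datetime.datetime.now(datetime.timezone.utc),
)
assert result.hash is None  # unset fields fall back to their None defaults
```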
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index f46d1cb..cf32874 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
         if url in self.seen_urls: return self.seen_urls[url]
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
 
         job_id = r.json()['job_id']
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -63,7 +65,7 @@ class WaybackArchiver(Archiver):
 
             status_json = status_r.json()
             if status_json['status'] != 'success':
-                return self.custom_retry(status_json, screenshot=screenshot)
+                return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)
 
         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
 
@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 7990131..c66378d 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -93,6 +93,7 @@ class YoutubeDLArchiver(Archiver):
 
         hash = self.get_hash(filename)
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         # get duration
         duration = info.get('duration')
@@ -113,4 +114,4 @@ class YoutubeDLArchiver(Archiver):
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/auto_archive.py b/auto_archive.py
index f12b9c4..86d951b 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -30,6 +30,7 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)
 
     if result.timestamp is not None:
         if type(result.timestamp) == int:
diff --git a/example.config.yaml b/example.config.yaml
index acbe52c..c9dd323 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -119,4 +119,5 @@ execution:
       duration: duration
       screenshot: screenshot
       hash: hash
+      wacz: wacz
 
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index b124aae..fa8e0b9 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -71,5 +71,8 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        if key.endswith('.wacz'):
+            extra_args['ContentType'] = "application/zip"
+        else:
+            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 0e05ab6..eda2cc6 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -20,7 +20,8 @@ class GWorksheet:
         'title': 'upload title',
         'duration': 'duration',
         'screenshot': 'screenshot',
-        'hash': 'hash'
+        'hash': 'hash',
+        'wacz': 'wacz'
     }
 
     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
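On the `storages/s3_storage.py` change: the pre-existing line relied on `mimetypes.guess_type`, but a stock Python install has no mapping for `.wacz`, so WACZ uploads would have ended up with a `None` ContentType. The new if/else pins them to `application/zip`, which is reasonable since a WACZ file is a ZIP container. A quick demonstration, plus an alternative one-time registration that would have the same effect (shown only as a sketch, not what the diff does):

```python
import mimetypes

print(mimetypes.guess_type("example.wacz"))  # (None, None) on a stock install
print(mimetypes.guess_type("example.zip"))   # ('application/zip', None)

# Equivalent alternative to the if/else in upload(): register the
# extension once at import time and leave the guess_type call alone.
mimetypes.add_type("application/zip", ".wacz")
print(mimetypes.guess_type("example.wacz"))  # ('application/zip', None)
```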