From 3b87dffe6bdee04c2169c5603730d142b6baae4b Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Sun, 25 Sep 2022 19:40:20 +0000
Subject: [PATCH] Add browsertrix-crawler capture

The [browsertrix-crawler] utility is a browser-based crawler that can crawl
one or more pages. browsertrix-crawler creates archives in the [WACZ] format,
which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR,
etc.) that can then be replayed using the [ReplayWeb.page] web component, or
unzipped to get the original WARC data (the ISO standard format used by the
Internet Archive Wayback Machine).

This PR adds browsertrix-crawler to the archiver classes where screenshots
are made. The WACZ is uploaded to storage and then added to a new column in
the spreadsheet. A column can be added that will display the WACZ, loaded
from cloud storage (S3, DigitalOcean, etc.) using the client-side
ReplayWeb.page component. You can see an example of the spreadsheet here:
https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0

browsertrix-crawler requires Docker to be installed. If Docker is not
installed, an error message is logged and archiving continues as normal.

[browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler
[WACZ]: https://specs.webrecorder.net/wacz/latest/
[ReplayWeb.page]: https://replayweb.page
---
 README.md                       |  2 ++
 archivers/base_archiver.py      | 38 ++++++++++++++++++++++++++++++++-
 archivers/telegram_archiver.py  |  3 ++-
 archivers/tiktok_archiver.py    |  3 ++-
 archivers/twitter_archiver.py   |  6 ++++--
 archivers/wayback_archiver.py   | 11 +++++-----
 archivers/youtubedl_archiver.py |  3 ++-
 auto_archive.py                 |  1 +
 example.config.yaml             |  1 +
 storages/s3_storage.py          |  5 ++++-
 utils/gworksheet.py             |  3 ++-
 11 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index ca5e06a..0b79cbb 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots using browsertrix-crawler
+   in addition to screenshots you will need to install Docker.
 
 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments.
 Here is the current result from running the `python auto_archive.py --help`:

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 902f626..91cc25a 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -24,6 +24,7 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None
 
 class Archiver(ABC):
@@ -200,6 +201,41 @@ class Archiver(ABC):
 
         return self.storage.get_cdn_url(key)
 
+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = key.replace(".wacz", "").replace("-", "")
+
+        cwd = os.getcwd()
+        cmd = [
+            "docker", "run",
+            "-v", f"{cwd}/browsertrix:/crawls/",
+            "-it",
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", "90"
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"wacz generation failed: {e}")
+            return
+
+        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+
+        self.storage.upload(filename, key, extra_args={
+            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # TODO: remove wacz collection, waiting for resolution on:
+        # https://github.com/webrecorder/browsertrix-crawler/issues/170
+
+        return self.storage.get_cdn_url(key)
+
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
         key_folder = key.split('.')[0] + os.path.sep
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 0b6e777..d98f761 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
             url += "?embed=1"
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
+            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
 
         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 8100bb1..bdaad52 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
 
             hash = self.get_hash(filename)
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
 
             try: os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):
 
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 8f646fd..81f20ab 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 
@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index f46d1cb..cf32874 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
         if url in self.seen_urls: return self.seen_urls[url]
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
 
         job_id = r.json()['job_id']
 
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -63,7 +65,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return self.custom_retry(status_json, screenshot=screenshot)
+            return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)
 
         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
 
@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 7990131..c66378d 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -93,6 +93,7 @@ class YoutubeDLArchiver(Archiver):
             hash = self.get_hash(filename)
 
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
 
         # get duration
         duration = info.get('duration')
@@ -113,4 +114,4 @@ class YoutubeDLArchiver(Archiver):
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/auto_archive.py b/auto_archive.py
index f12b9c4..86d951b 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -30,6 +30,7 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)
 
     if result.timestamp is not None:
         if type(result.timestamp) == int:
diff --git a/example.config.yaml b/example.config.yaml
index acbe52c..c9dd323 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -119,4 +119,5 @@ execution:
       duration: duration
       screenshot: screenshot
       hash: hash
+      wacz: wacz
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index b124aae..fa8e0b9 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -71,5 +71,8 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+            if key.endswith('.wacz'):
+                extra_args['ContentType'] = "application/zip"
+            else:
+                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 0e05ab6..eda2cc6 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -20,7 +20,8 @@ class GWorksheet:
         'title': 'upload title',
         'duration': 'duration',
         'screenshot': 'screenshot',
-        'hash': 'hash'
+        'hash': 'hash',
+        'wacz': 'wacz'
     }
 
     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
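
A quick way to sanity-check the commit message's claim that a WACZ is just a standardized ZIP wrapping WARC data is to open one of the generated files with Python's standard library. The sketch below is illustrative only and not part of the patch: the collection name is hypothetical, and the internal layout (datapackage.json, pages/pages.jsonl, archive/*.warc.gz) is what the WACZ spec describes, so treat those paths as assumptions.

```python
# Minimal sketch (assumed paths, not part of this patch): inspect a WACZ
# produced by get_wacz() and pull out its WARC payload for other tooling.
import os
import zipfile

collection = "example20220925"  # hypothetical collection name
wacz_path = os.path.join(os.getcwd(), "browsertrix", "collections",
                         collection, f"{collection}.wacz")

with zipfile.ZipFile(wacz_path) as wacz:
    # List the members; per the WACZ spec this should include
    # datapackage.json, pages/pages.jsonl and archive/*.warc.gz
    for name in wacz.namelist():
        print(name)
    # Extract only the WARC records so they can be used directly
    # with standard WARC tools.
    warcs = [n for n in wacz.namelist() if n.startswith("archive/")]
    wacz.extractall("wacz-contents", members=warcs)
```

Once the .wacz has been uploaded with a public-read ACL, pointing ReplayWeb.page at its CDN URL (or embedding the web component) should replay the capture entirely client-side, which is what the new spreadsheet column relies on.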