Add browsertrix-crawler capture

The [browsertrix-crawler] utility is a browser-based crawler that can
crawl one or more pages. browsertrix-crawler creates archives in the
[WACZ] format, which is essentially a standardized ZIP file (similar to DOCX, EPUB, JAR, etc.) that can then be replayed using the [ReplayWeb.page] web
component, or unzipped to get the original WARC data (the ISO-standard
format used by the Internet Archive's Wayback Machine).
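
Since a WACZ is just a ZIP, its contents can be inspected with standard tooling. A minimal Python sketch (assuming a local file named `example.wacz`; per the WACZ spec the WARC data typically sits under `archive/`):

```python
import zipfile

# A WACZ is a regular ZIP: list its contents, then pull out the WARC
# files, which normally live under the archive/ directory.
with zipfile.ZipFile("example.wacz") as wacz:
    for name in wacz.namelist():
        print(name)
    warcs = [n for n in wacz.namelist() if n.endswith(".warc.gz")]
    wacz.extractall(members=warcs, path="warc-out")
```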

This PR adds browsertrix-crawler to the archiver classes where screenshots are made. The WACZ is uploaded to storage and then added to a new column in the spreadsheet. A column can be added that will display the WACZ, loaded from cloud storage (S3, DigitalOcean, etc.) using the client-side ReplayWeb.page viewer. You can see an example of the spreadsheet here:

https://docs.google.com/spreadsheets/d/1Tk-iJWzT9Sx2-YccuPttL9HcMdZEnhv_OR7Bc6tfeu8/edit#gid=0
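
As an illustration of how such a column can be wired up (not code from this PR), the stored WACZ's public URL can be handed to the hosted ReplayWeb.page viewer through its `source` query parameter. A rough sketch, using a hypothetical bucket URL:

```python
from urllib.parse import urlencode

def replay_link(wacz_cdn_url: str, original_url: str) -> str:
    """Build a ReplayWeb.page viewer link for a publicly readable WACZ.

    Assumes the hosted viewer's `source` (archive location) and `url`
    (page to display) query parameters; adjust if self-hosting the viewer.
    """
    query = urlencode({"source": wacz_cdn_url, "url": original_url})
    return f"https://replayweb.page/?{query}"

# Hypothetical bucket/key and tweet URL, for illustration only:
print(replay_link(
    "https://example-bucket.s3.amazonaws.com/archives/abc123.wacz",
    "https://twitter.com/some/status/123",
))
```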

browsertrix-crawler requires Docker to be installed. If Docker is not
installed, an error message is logged and archiving continues as normal, just without a WACZ.
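
This PR handles that by catching the failed `subprocess` call inside `get_wacz`, but for illustration a pre-flight check could look like the sketch below (the helper name and log message are hypothetical; `logger` is assumed to be loguru's, as used elsewhere in the diffs):

```python
import shutil
from loguru import logger

def docker_available() -> bool:
    # Look for the docker client on PATH; if it is missing, WACZ capture
    # is skipped and the rest of the archiving run continues as normal.
    if shutil.which("docker") is None:
        logger.error("docker not found, skipping WACZ capture")
        return False
    return True
```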

[browsertrix-crawler]: https://github.com/webrecorder/browsertrix-crawler
[WACZ]: https://specs.webrecorder.net/wacz/latest/
[ReplayWeb.page]: https://replayweb.page
pull/63/head
Ed Summers 2022-09-25 19:40:20 +00:00
parent 0bdd06f641
commit 3b87dffe6b
11 changed files with 63 additions and 13 deletions

View file

@@ -18,6 +18,8 @@ You also need:
 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
+6. If you would like to take archival WACZ snapshots using browsertrix-crawler
+   in addition to screenshots you will need to install Docker.

 ### Configuration file
 Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:

View file

@@ -1,4 +1,4 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes
+import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
@@ -24,6 +24,7 @@ class ArchiveResult:
     title: str = None
     timestamp: datetime.datetime = None
     screenshot: str = None
+    wacz: str = None
     hash: str = None

 class Archiver(ABC):
@@ -200,6 +201,41 @@ class Archiver(ABC):
         return self.storage.get_cdn_url(key)

+    def get_wacz(self, url):
+        logger.debug(f"getting wacz for {url}")
+        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
+        collection = key.replace(".wacz", "").replace("-", "")
+
+        cwd = os.getcwd()
+        cmd = [
+            "docker", "run",
+            "-v", f"{cwd}/browsertrix:/crawls/",
+            "-it",
+            "webrecorder/browsertrix-crawler", "crawl",
+            "--url", url,
+            "--scopeType", "page",
+            "--generateWACZ",
+            "--text",
+            "--collection", collection,
+            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+            "--behaviorTimeout", "90"
+        ]
+
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            logger.error(f"wacz generation failed: {e}")
+            return
+
+        filename = os.path.join(cwd, "browsertrix", "collections", collection, f"{collection}.wacz")
+        self.storage.upload(filename, key, extra_args={
+            'ACL': 'public-read', 'ContentType': 'application/zip'})
+
+        # TODO: remove wacz collection, waiting for resolution on:
+        # https://github.com/webrecorder/browsertrix-crawler/issues/170
+
+        return self.storage.get_cdn_url(key)
+
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
         key_folder = key.split('.')[0] + os.path.sep

View file

@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
             url += "?embed=1"

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None

-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
+            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)

         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]

View file

@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):
             hash = self.get_hash(filename)
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)

             try: os.remove(filename)
             except FileNotFoundError:
@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
-                                 timestamp=timestamp, hash=hash, screenshot=screenshot)
+                                 timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'

View file

@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
         if tweet.media is None:
             logger.debug(f'No media found, archiving tweet text only')
             screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)

         urls = []
@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)

     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010

View file

@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
         if url in self.seen_urls: return self.seen_urls[url]

         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)
+
         logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
+            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)

         if 'job_id' not in r.json() and 'message' in r.json():
-            return self.custom_retry(r.json(), screenshot=screenshot)
+            return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

         job_id = r.json()['job_id']
         logger.debug(f"GETting status for {job_id=} on {url=}")
@@ -63,7 +65,7 @@ class WaybackArchiver(Archiver):
             status_json = status_r.json()
             if status_json['status'] != 'success':
-                return self.custom_retry(status_json, screenshot=screenshot)
+                return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)

             archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        screenshot = self.get_screenshot(url)
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
+        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]

     def custom_retry(self, json_data, **kwargs):

View file

@@ -93,6 +93,7 @@ class YoutubeDLArchiver(Archiver):
         hash = self.get_hash(filename)
         screenshot = self.get_screenshot(url)
+        wacz = self.get_wacz(url)

         # get duration
         duration = info.get('duration')
@@ -113,4 +114,4 @@ class YoutubeDLArchiver(Archiver):
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

View file

@@ -30,6 +30,7 @@ def update_sheet(gw, row, result: ArchiveResult):
     batch_if_valid('duration', result.duration, str(result.duration))
     batch_if_valid('screenshot', result.screenshot)
     batch_if_valid('hash', result.hash)
+    batch_if_valid('wacz', result.wacz)

     if result.timestamp is not None:
         if type(result.timestamp) == int:

View file

@@ -119,4 +119,5 @@ execution:
     duration: duration
     screenshot: screenshot
     hash: hash
+    wacz: wacz

View file

@@ -71,5 +71,8 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+        if key.endswith('.wacz'):
+            extra_args['ContentType'] = "application/zip"
+        else:
+            extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View file

@@ -20,7 +20,8 @@ class GWorksheet:
         'title': 'upload title',
         'duration': 'duration',
         'screenshot': 'screenshot',
-        'hash': 'hash'
+        'hash': 'hash',
+        'wacz': 'wacz'
     }

     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):