improves browsertrix configurations

pull/80/head
msramalho 2022-10-18 11:21:10 +01:00
rodzic df502f3bde
commit f0f844a569
5 zmienionych plików z 15 dodań i 5 usunięć

Wyświetl plik

@ -203,6 +203,10 @@ class Archiver(ABC):
return self.storage.get_cdn_url(key) return self.storage.get_cdn_url(key)
def get_wacz(self, url): def get_wacz(self, url):
if not self.browsertrix.enabled:
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
return
logger.debug(f"getting wacz for {url}") logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True) key = self._get_key_from_url(url, ".wacz", append_datetime=True)
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
@ -219,7 +223,8 @@ class Archiver(ABC):
"--text", "--text",
"--collection", collection, "--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.browsertrix.timeout_seconds) "--behaviorTimeout", str(self.browsertrix.timeout_seconds),
"--timeout", str(self.browsertrix.timeout_seconds)
] ]
if not os.path.isdir(browsertrix_home): if not os.path.isdir(browsertrix_home):

Wyświetl plik

@ -31,6 +31,7 @@ def update_sheet(gw, row, url, result: ArchiveResult):
batch_if_valid('duration', result.duration, str(result.duration)) batch_if_valid('duration', result.duration, str(result.duration))
batch_if_valid('screenshot', result.screenshot) batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash) batch_if_valid('hash', result.hash)
if result.wacz is not None:
batch_if_valid('wacz', result.wacz) batch_if_valid('wacz', result.wacz)
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')

Wyświetl plik

@ -2,5 +2,6 @@ from dataclasses import dataclass
@dataclass
class BrowsertrixConfig:
    """Settings for the optional browsertrix WACZ capture step.

    Instances are built from the ``browsertrix`` section of the
    configuration file.
    """

    # whether browsertrix WACZ generation should run at all
    enabled: bool
    # path to a browsertrix browser profile archive ("" when none is configured)
    profile: str
    # timeout in seconds, always stored as a string so it can be placed
    # directly into a command-line argument list
    timeout_seconds: str

    def __post_init__(self):
        # A YAML entry such as `timeout_seconds: 120` is parsed as an int,
        # while the fallback default is the string "90". Normalize here so
        # the field always matches its declared `str` type.
        self.timeout_seconds = str(self.timeout_seconds)

Wyświetl plik

@ -86,6 +86,7 @@ class Config:
if len(browsertrix_profile := browsertrix_configs.get("profile", "")): if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
browsertrix_profile = os.path.abspath(browsertrix_profile) browsertrix_profile = os.path.abspath(browsertrix_profile)
self.browsertrix_config = BrowsertrixConfig( self.browsertrix_config = BrowsertrixConfig(
enabled=bool(browsertrix_configs.get("enabled", False)),
profile=browsertrix_profile, profile=browsertrix_profile,
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
) )

Wyświetl plik

@ -103,10 +103,12 @@ execution:
window_width: 1400 window_width: 1400
window_height: 2000 window_height: 2000
# optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
# browsertrix captures a WACZ archive of the page, which can then be replayed like the original page on replayweb.page (https://replayweb.page)
browsertrix: browsertrix:
enabled: true # defaults to false
profile: "./browsertrix/crawls/profile.tar.gz" profile: "./browsertrix/crawls/profile.tar.gz"
timeout_seconds: 90 # defaults to 90s timeout_seconds: 120 # defaults to 90s
# puts execution logs into /logs folder, defaults to false # puts execution logs into /logs folder, defaults to false
save_logs: true save_logs: true
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"