kopia lustrzana https://github.com/bellingcat/auto-archiver
improves browsertrix configurations
rodzic
df502f3bde
commit
f0f844a569
|
@ -203,6 +203,10 @@ class Archiver(ABC):
|
||||||
return self.storage.get_cdn_url(key)
|
return self.storage.get_cdn_url(key)
|
||||||
|
|
||||||
def get_wacz(self, url):
|
def get_wacz(self, url):
|
||||||
|
if not self.browsertrix.enabled:
|
||||||
|
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
|
||||||
|
return
|
||||||
|
|
||||||
logger.debug(f"getting wacz for {url}")
|
logger.debug(f"getting wacz for {url}")
|
||||||
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
|
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
|
||||||
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
|
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
|
||||||
|
@ -219,7 +223,8 @@ class Archiver(ABC):
|
||||||
"--text",
|
"--text",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
"--behaviorTimeout", str(self.browsertrix.timeout_seconds)
|
"--behaviorTimeout", str(self.browsertrix.timeout_seconds),
|
||||||
|
"--timeout", str(self.browsertrix.timeout_seconds)
|
||||||
]
|
]
|
||||||
|
|
||||||
if not os.path.isdir(browsertrix_home):
|
if not os.path.isdir(browsertrix_home):
|
||||||
|
|
|
@ -31,6 +31,7 @@ def update_sheet(gw, row, url, result: ArchiveResult):
|
||||||
batch_if_valid('duration', result.duration, str(result.duration))
|
batch_if_valid('duration', result.duration, str(result.duration))
|
||||||
batch_if_valid('screenshot', result.screenshot)
|
batch_if_valid('screenshot', result.screenshot)
|
||||||
batch_if_valid('hash', result.hash)
|
batch_if_valid('hash', result.hash)
|
||||||
|
if result.wacz is not None:
|
||||||
batch_if_valid('wacz', result.wacz)
|
batch_if_valid('wacz', result.wacz)
|
||||||
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
|
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
|
||||||
|
|
||||||
|
|
|
@ -2,5 +2,6 @@ from dataclasses import dataclass
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BrowsertrixConfig:
|
class BrowsertrixConfig:
|
||||||
|
enabled: bool
|
||||||
profile: str
|
profile: str
|
||||||
timeout_seconds: str
|
timeout_seconds: str
|
||||||
|
|
|
@ -86,6 +86,7 @@ class Config:
|
||||||
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
|
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
|
||||||
browsertrix_profile = os.path.abspath(browsertrix_profile)
|
browsertrix_profile = os.path.abspath(browsertrix_profile)
|
||||||
self.browsertrix_config = BrowsertrixConfig(
|
self.browsertrix_config = BrowsertrixConfig(
|
||||||
|
enabled=bool(browsertrix_configs.get("enabled", False)),
|
||||||
profile=browsertrix_profile,
|
profile=browsertrix_profile,
|
||||||
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
|
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
|
||||||
)
|
)
|
||||||
|
|
|
@ -103,10 +103,12 @@ execution:
|
||||||
window_width: 1400
|
window_width: 1400
|
||||||
window_height: 2000
|
window_height: 2000
|
||||||
|
|
||||||
# optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
|
# optional browsertrix configuration (for profile generation, see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
|
||||||
|
# browsertrix captures a WACZ archive of the page, which can then be replayed like the original page on ReplayWeb.page (https://replayweb.page)
|
||||||
browsertrix:
|
browsertrix:
|
||||||
|
enabled: true # defaults to false
|
||||||
profile: "./browsertrix/crawls/profile.tar.gz"
|
profile: "./browsertrix/crawls/profile.tar.gz"
|
||||||
timeout_seconds: 90 # defaults to 90s
|
timeout_seconds: 120 # defaults to 90s
|
||||||
# puts execution logs into /logs folder, defaults to false
|
# puts execution logs into /logs folder, defaults to false
|
||||||
save_logs: true
|
save_logs: true
|
||||||
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
|
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
|
||||||
|
|
Ładowanie…
Reference in New Issue