Merge branch 'dockerize'

pull/80/head
Logan Williams 2023-05-11 15:09:02 +02:00
commit 074f132ad9
1 zmieniony plik, 5 dodań i 12 usunięć

Wyświetl plik

@@ -25,13 +25,7 @@ class WaczEnricher(Enricher):
} }
def enrich(self, to_enrich: Metadata) -> bool: def enrich(self, to_enrich: Metadata) -> bool:
# TODO: figure out support for browsertrix in docker
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
collection = str(uuid.uuid4())[0:8] collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
@@ -50,9 +44,10 @@ class WaczEnricher(Enricher):
"--saveState", "never", "--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout), "--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout), "--timeout", str(self.timeout)]
"--profile", str(self.profile)
] if self.profile:
cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
else: else:
logger.debug(f"generating WACZ in Docker for {url=}") logger.debug(f"generating WACZ in Docker for {url=}")
@@ -75,9 +70,7 @@ class WaczEnricher(Enricher):
if self.profile: if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz") profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn) shutil.copyfile(self.profile, profile_fn)
# TODO: test which is right cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
cmd.extend(["--profile", profile_fn])
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
try: try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")