From 0fae7d96fb219dde7615efa82da6eb281f043340 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 9 May 2023 12:12:02 +0200 Subject: [PATCH] Detect running in docker container in WACZ enricher --- Dockerfile | 4 +- src/auto_archiver/enrichers/wacz_enricher.py | 53 +++++++++++++------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index 60805cd..fef93b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,8 @@ # stage 1 - all dependencies FROM webrecorder/browsertrix-crawler:latest +ENV RUNNING_IN_DOCKER=1 + WORKDIR /app # TODO: use custom ffmpeg builds instead of apt-get install @@ -28,7 +30,7 @@ COPY ./src/ . # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile? # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo . # USER archiver -ENTRYPOINT ["python"] +ENTRYPOINT ["python3"] # ENTRYPOINT ["docker-entrypoint.sh"] # should be executed with 2 volumes (3 if local_storage) diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index f594efe..c4530ed 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -26,30 +26,49 @@ class WaczEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> bool: # TODO: figure out support for browsertrix in docker + url = to_enrich.get_url() if UrlUtil.is_auth_wall(url): logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") return - - logger.debug(f"generating WACZ for {url=}") + collection = str(uuid.uuid4())[0:8] browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) - cmd = [ - "docker", "run", - "--rm", # delete container once it has completed running - "-v", f"{browsertrix_home}:/crawls/", - # "-it", # this leads to "the input device is not a TTY" - "webrecorder/browsertrix-crawler", "crawl", - "--url", url, - "--scopeType", "page", - "--generateWACZ", - "--text", - "--collection", collection, - "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", str(self.timeout), - "--timeout", str(self.timeout) - ] + + if os.getenv('RUNNING_IN_DOCKER'): + logger.debug(f"generating WACZ without Docker for {url=}") + + cmd = [ + "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--collection", collection, + "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", str(self.timeout), + "--timeout", str(self.timeout) + ] + else: + logger.debug(f"generating WACZ in Docker for {url=}") + + cmd = [ + "docker", "run", + "--rm", # delete container once it has completed running + "-v", f"{browsertrix_home}:/crawls/", + # "-it", # this leads to "the input device is not a TTY" + "webrecorder/browsertrix-crawler", "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--collection", collection, + "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", str(self.timeout), + "--timeout", str(self.timeout) + ] + if self.profile: profile_fn = os.path.join(browsertrix_home, "profile.tar.gz") shutil.copyfile(self.profile, profile_fn)