From f7839a99ccd6cd73fab690c5f9194b9a7fa878a9 Mon Sep 17 00:00:00 2001 From: Kai Date: Thu, 14 Sep 2023 18:49:37 +0200 Subject: [PATCH] Add configs for path to write and read wacz archives (#93) Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com> --- Dockerfile | 2 +- src/auto_archiver/enrichers/wacz_enricher.py | 68 +++++++++----------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0b4abf1..ccbc399 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,4 +28,4 @@ COPY ./src/ . ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"] # should be executed with 2 volumes (3 if local_storage is used) -# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml \ No newline at end of file +# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index 3cf05d9..d371546 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -28,7 +28,6 @@ class WaczArchiverEnricher(Enricher, Archiver): return { "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"}, - "browsertrix_home": {"default": None, "help": "Path to use with the custom browsertrix file locations, useful together with docker_commands"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."} } @@ -48,50 +47,45 @@ class WaczArchiverEnricher(Enricher, Archiver): url = to_enrich.get_url() collection = str(uuid.uuid4())[0:8] - browsertrix_home = self.browsertrix_home or os.path.abspath(ArchivingContext.get_tmp_dir()) + browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir()) + browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host - if os.environ.get('RUNNING_IN_DOCKER', 0) == '1': - logger.debug(f"generating WACZ without Docker for {url=}") + cmd = [ + "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--screenshot", "fullPage", + "--collection", collection, + "--id", collection, + "--saveState", "never", + "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", str(self.timeout), + "--timeout", str(self.timeout)] - cmd = [ - "crawl", - "--url", url, - "--scopeType", "page", - "--generateWACZ", - "--text", - "--screenshot", "fullPage", - "--collection", collection, - "--id", collection, - "--saveState", "never", - "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", str(self.timeout), - "--timeout", str(self.timeout)] + # call docker if explicitly enabled or we are running on the host (not in docker) + use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') - if self.profile: - cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) - else: + if use_docker: logger.debug(f"generating WACZ in Docker for {url=}") + logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}") if not self.docker_commands: - self.docker_commands = ["docker", "run", "--rm", "-v", f"{browsertrix_home}:/crawls/", "webrecorder/browsertrix-crawler"] - cmd = self.docker_commands + [ - "crawl", - "--url", url, - "--scopeType", "page", - "--generateWACZ", - "--text", - "--screenshot", "fullPage", - "--collection", collection, - "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", str(self.timeout), - "--timeout", str(self.timeout) - ] + self.docker_commands = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd = self.docker_commands + cmd if self.profile: - profile_fn = os.path.join(browsertrix_home, "profile.tar.gz") + profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz") logger.debug(f"copying {self.profile} to {profile_fn}") shutil.copyfile(self.profile, profile_fn) cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")]) + else: + logger.debug(f"generating WACZ without Docker for {url=}") + + if self.profile: + cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) + try: logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") subprocess.run(cmd, check=True) @@ -99,10 +93,10 @@ class WaczArchiverEnricher(Enricher, Archiver): logger.error(f"WACZ generation failed: {e}") return False - if os.getenv('RUNNING_IN_DOCKER'): - filename = os.path.join("collections", collection, f"{collection}.wacz") + if use_docker: + filename = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz") else: - filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") + filename = os.path.join("collections", collection, f"{collection}.wacz") if not os.path.exists(filename): logger.warning(f"Unable to locate and upload WACZ {filename=}")