kopia lustrzana https://github.com/bellingcat/auto-archiver
Create an independent profile file for each wacz_extractor_enricher instance
rodzic
244341d22c
commit
e531906d73
|
@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||||
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||||
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||||
|
|
||||||
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
self.crawl_id = random_str(8)
|
||||||
|
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
|
||||||
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||||
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||||
|
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
|
|
||||||
collection = random_str(8)
|
collection = self.crawl_id
|
||||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||||
|
|
||||||
|
@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||||
] + cmd
|
] + cmd
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
|
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
||||||
|
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||||
logger.debug(f"copying {self.profile} to {profile_fn}")
|
logger.debug(f"copying {self.profile} to {profile_fn}")
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
|
cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||||
|
|
Ładowanie…
Reference in New Issue