kopia lustrzana https://github.com/bellingcat/auto-archiver
Detect running in docker container in WACZ enricher
rodzic
2f7181ced6
commit
0fae7d96fb
|
@ -1,6 +1,8 @@
|
|||
# stage 1 - all dependencies
|
||||
FROM webrecorder/browsertrix-crawler:latest
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# TODO: use custom ffmpeg builds instead of apt-get install
|
||||
|
@ -28,7 +30,7 @@ COPY ./src/ .
|
|||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
ENTRYPOINT ["python"]
|
||||
ENTRYPOINT ["python3"]
|
||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage)
|
||||
|
|
|
@ -26,30 +26,49 @@ class WaczEnricher(Enricher):
|
|||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
else:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
|
|
Ładowanie…
Reference in New Issue