kopia lustrzana https://github.com/bellingcat/auto-archiver
Working, but some cleanup still necessary
rodzic
0fae7d96fb
commit
ac82764ffc
36
Dockerfile
36
Dockerfile
|
@ -2,6 +2,7 @@
|
|||
FROM webrecorder/browsertrix-crawler:latest
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1
|
||||
# ENV NO_PROXY=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
@ -16,22 +17,27 @@ RUN pip install --upgrade pip && \
|
|||
rm geckodriver-v*
|
||||
|
||||
|
||||
# install docker for WACZ
|
||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||
# RUN curl -fsSL https://get.docker.com | sh
|
||||
# # install docker for WACZ
|
||||
# # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||
# # RUN curl -fsSL https://get.docker.com | sh
|
||||
|
||||
# TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile Pipfile.lock ./
|
||||
RUN pipenv install --python=3.10 --system --deploy
|
||||
# ENV IS_DOCKER=1
|
||||
# doing this at the end helps during development, builds are quick
|
||||
# # TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile* ./
|
||||
RUN pipenv install --skip-lock
|
||||
# # ENV IS_DOCKER=1
|
||||
# # doing this at the end helps during development, builds are quick
|
||||
COPY ./src/ .
|
||||
|
||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
ENTRYPOINT ["python3"]
|
||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
# # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# # USER archiver
|
||||
# # ENTRYPOINT ["python3"]
|
||||
# # ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage)
|
||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
||||
ADD docker-entrypoint.sh /docker-entrypoint.sh
|
||||
ENTRYPOINT ["/docker-entrypoint.sh"]
|
||||
|
||||
CMD ["python3"]
|
||||
|
||||
# # should be executed with 2 volumes (3 if local_storage)
|
||||
# # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
6
Pipfile
6
Pipfile
|
@ -30,9 +30,13 @@ cryptography = "==38.0.4"
|
|||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
pywb = ">=2.7.3"
|
||||
uwsgi = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
# wacz = "==0.4.8"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.10"
|
||||
|
||||
[dev-packages]
|
||||
autopep8 = "*"
|
||||
|
|
Plik diff jest za duży
Load Diff
|
@ -0,0 +1,27 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Get UID/GID from volume dir
|
||||
|
||||
VOLUME_UID=$(stat -c '%u' /crawls)
|
||||
VOLUME_GID=$(stat -c '%g' /crawls)
|
||||
|
||||
# Get the UID/GID we are running as
|
||||
|
||||
MY_UID=$(id -u)
|
||||
MY_GID=$(id -g)
|
||||
|
||||
# If we aren't running as the owner of the /crawls/ dir then add a new user
|
||||
# btrix with the same UID/GID of the /crawls dir and run as that user instead.
|
||||
|
||||
if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
|
||||
groupadd btrix
|
||||
groupmod -o --gid $VOLUME_GID btrix
|
||||
|
||||
useradd -ms /bin/bash -g $VOLUME_GID btrix
|
||||
usermod -o -u $VOLUME_UID btrix > /dev/null
|
||||
|
||||
su btrix -c '"$@"' -- argv0-ignore "$@"
|
||||
else
|
||||
exec "$@"
|
||||
fi
|
||||
|
|
@ -46,9 +46,12 @@ class WaczEnricher(Enricher):
|
|||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--id", collection,
|
||||
"--saveState", "never",
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
"--timeout", str(self.timeout),
|
||||
"--profile", str(self.profile)
|
||||
]
|
||||
else:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
@ -69,12 +72,12 @@ class WaczEnricher(Enricher):
|
|||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
|
@ -83,7 +86,13 @@ class WaczEnricher(Enricher):
|
|||
logger.error(f"WACZ generation failed: {e}")
|
||||
return False
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||
else:
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
if not os.path.exists(filename):
|
||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||
return False
|
||||
|
|
Ładowanie…
Reference in New Issue