kopia lustrzana https://github.com/bellingcat/auto-archiver
Working, but some cleanup still necessary
rodzic
0fae7d96fb
commit
ac82764ffc
36
Dockerfile
36
Dockerfile
|
@ -2,6 +2,7 @@
|
||||||
FROM webrecorder/browsertrix-crawler:latest
|
FROM webrecorder/browsertrix-crawler:latest
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1
|
ENV RUNNING_IN_DOCKER=1
|
||||||
|
# ENV NO_PROXY=1
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
@ -16,22 +17,27 @@ RUN pip install --upgrade pip && \
|
||||||
rm geckodriver-v*
|
rm geckodriver-v*
|
||||||
|
|
||||||
|
|
||||||
# install docker for WACZ
|
# # install docker for WACZ
|
||||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
# # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||||
# RUN curl -fsSL https://get.docker.com | sh
|
# # RUN curl -fsSL https://get.docker.com | sh
|
||||||
|
|
||||||
# TODO: avoid copying unnecessary files, including .git
|
# # TODO: avoid copying unnecessary files, including .git
|
||||||
COPY Pipfile Pipfile.lock ./
|
COPY Pipfile* ./
|
||||||
RUN pipenv install --python=3.10 --system --deploy
|
RUN pipenv install --skip-lock
|
||||||
# ENV IS_DOCKER=1
|
# # ENV IS_DOCKER=1
|
||||||
# doing this at the end helps during development, builds are quick
|
# # doing this at the end helps during development, builds are quick
|
||||||
COPY ./src/ .
|
COPY ./src/ .
|
||||||
|
|
||||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
# # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
# # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||||
# USER archiver
|
# # USER archiver
|
||||||
ENTRYPOINT ["python3"]
|
# # ENTRYPOINT ["python3"]
|
||||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
# # ENTRYPOINT ["docker-entrypoint.sh"]
|
||||||
|
|
||||||
# should be executed with 2 volumes (3 if local_storage)
|
ADD docker-entrypoint.sh /docker-entrypoint.sh
|
||||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
ENTRYPOINT ["/docker-entrypoint.sh"]
|
||||||
|
|
||||||
|
CMD ["python3"]
|
||||||
|
|
||||||
|
# # should be executed with 2 volumes (3 if local_storage)
|
||||||
|
# # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
6
Pipfile
6
Pipfile
|
@ -30,9 +30,13 @@ cryptography = "==38.0.4"
|
||||||
dataclasses-json = "*"
|
dataclasses-json = "*"
|
||||||
yt-dlp = ">=2023.2.17"
|
yt-dlp = ">=2023.2.17"
|
||||||
vk-url-scraper = "*"
|
vk-url-scraper = "*"
|
||||||
|
pywb = ">=2.7.3"
|
||||||
|
uwsgi = "*"
|
||||||
|
requests = {extras = ["socks"], version = "*"}
|
||||||
|
# wacz = "==0.4.8"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.10"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
autopep8 = "*"
|
autopep8 = "*"
|
||||||
|
|
Plik diff jest za duży
Load Diff
|
@ -0,0 +1,27 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Get UID/GID from volume dir
|
||||||
|
|
||||||
|
VOLUME_UID=$(stat -c '%u' /crawls)
|
||||||
|
VOLUME_GID=$(stat -c '%g' /crawls)
|
||||||
|
|
||||||
|
# Get the UID/GID we are running as
|
||||||
|
|
||||||
|
MY_UID=$(id -u)
|
||||||
|
MY_GID=$(id -g)
|
||||||
|
|
||||||
|
# If we aren't running as the owner of the /crawls/ dir then add a new user
|
||||||
|
# btrix with the same UID/GID of the /crawls dir and run as that user instead.
|
||||||
|
|
||||||
|
if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
|
||||||
|
groupadd btrix
|
||||||
|
groupmod -o --gid $VOLUME_GID btrix
|
||||||
|
|
||||||
|
useradd -ms /bin/bash -g $VOLUME_GID btrix
|
||||||
|
usermod -o -u $VOLUME_UID btrix > /dev/null
|
||||||
|
|
||||||
|
su btrix -c '"$@"' -- argv0-ignore "$@"
|
||||||
|
else
|
||||||
|
exec "$@"
|
||||||
|
fi
|
||||||
|
|
|
@ -46,9 +46,12 @@ class WaczEnricher(Enricher):
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
|
"--id", collection,
|
||||||
|
"--saveState", "never",
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--behaviorTimeout", str(self.timeout),
|
||||||
"--timeout", str(self.timeout)
|
"--timeout", str(self.timeout),
|
||||||
|
"--profile", str(self.profile)
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||||
|
@ -69,12 +72,12 @@ class WaczEnricher(Enricher):
|
||||||
"--timeout", str(self.timeout)
|
"--timeout", str(self.timeout)
|
||||||
]
|
]
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
# TODO: test which is right
|
# TODO: test which is right
|
||||||
cmd.extend(["--profile", profile_fn])
|
cmd.extend(["--profile", profile_fn])
|
||||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||||
|
@ -83,7 +86,13 @@ class WaczEnricher(Enricher):
|
||||||
logger.error(f"WACZ generation failed: {e}")
|
logger.error(f"WACZ generation failed: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
|
||||||
|
|
||||||
|
if os.getenv('RUNNING_IN_DOCKER'):
|
||||||
|
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||||
|
else:
|
||||||
|
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||||
|
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||||
return False
|
return False
|
||||||
|
|
Ładowanie…
Reference in New Issue