Working, but some cleanup still necessary

pull/74/head
Logan Williams 2023-05-09 17:38:17 +02:00
rodzic 0fae7d96fb
commit ac82764ffc
5 zmienionych plików z 1077 dodań i 326 usunięć

Wyświetl plik

@ -2,6 +2,7 @@
FROM webrecorder/browsertrix-crawler:latest
ENV RUNNING_IN_DOCKER=1
# ENV NO_PROXY=1
WORKDIR /app
@ -16,22 +17,27 @@ RUN pip install --upgrade pip && \
rm geckodriver-v*
# install docker for WACZ
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# RUN curl -fsSL https://get.docker.com | sh
# # install docker for WACZ
# # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# # RUN curl -fsSL https://get.docker.com | sh
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
# ENV IS_DOCKER=1
# doing this at the end helps during development, builds are quick
# # TODO: avoid copying unnecessary files, including .git
COPY Pipfile* ./
RUN pipenv install --skip-lock
# # ENV IS_DOCKER=1
# # doing this at the end helps during development, builds are quick
COPY ./src/ .
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
ENTRYPOINT ["python3"]
# ENTRYPOINT ["docker-entrypoint.sh"]
# # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# # USER archiver
# # ENTRYPOINT ["python3"]
# # ENTRYPOINT ["docker-entrypoint.sh"]
# should be executed with 2 volumes (3 if local_storage)
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
ADD docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["python3"]
# # should be executed with 2 volumes (3 if local_storage)
# # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help

Wyświetl plik

@ -30,9 +30,13 @@ cryptography = "==38.0.4"
dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
vk-url-scraper = "*"
pywb = ">=2.7.3"
uwsgi = "*"
requests = {extras = ["socks"], version = "*"}
# wacz = "==0.4.8"
[requires]
python_version = "3.9"
python_version = "3.10"
[dev-packages]
autopep8 = "*"

1309
Pipfile.lock wygenerowano

Plik diff jest za duży Load Diff

Wyświetl plik

@ -0,0 +1,27 @@
#!/bin/sh
# Container entrypoint: run the given command ("$@") as the owner of the
# /crawls volume directory, so files written there keep the volume's ownership.
#
# Usage: docker-entrypoint.sh COMMAND [ARG...]

# Get UID/GID from volume dir
VOLUME_UID=$(stat -c '%u' /crawls)
VOLUME_GID=$(stat -c '%g' /crawls)

# Get the UID/GID we are running as
MY_UID=$(id -u)
MY_GID=$(id -g)

# If we aren't running as the owner of the /crawls/ dir then add a new user
# btrix with the same UID/GID of the /crawls dir and run as that user instead.
if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then
    # -f: succeed quietly if the group already exists (e.g. the same
    # container is being restarted and this script runs a second time).
    groupadd -f btrix
    # -o: allow a GID that is already in use by another group.
    groupmod -o --gid "$VOLUME_GID" btrix

    # Only create the user on the first start; it persists across restarts.
    if ! id -u btrix > /dev/null 2>&1; then
        useradd -ms /bin/bash -g "$VOLUME_GID" btrix
    fi
    # -o: allow a UID that is already in use by another user.
    usermod -o -u "$VOLUME_UID" btrix > /dev/null

    # The inner shell started by `su -c` receives the original arguments as
    # its positional parameters ("argv0-ignore" fills $0) and runs them via
    # '"$@"', which preserves argument boundaries. `exec` replaces this
    # wrapper shell so no extra process lingers in front of the command.
    exec su btrix -c '"$@"' -- argv0-ignore "$@"
else
    exec "$@"
fi

Wyświetl plik

@ -46,9 +46,12 @@ class WaczEnricher(Enricher):
"--generateWACZ",
"--text",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
"--timeout", str(self.timeout),
"--profile", str(self.profile)
]
else:
logger.debug(f"generating WACZ in Docker for {url=}")
@ -69,12 +72,12 @@ class WaczEnricher(Enricher):
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
# TODO: test which is right
cmd.extend(["--profile", profile_fn])
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
# TODO: test which is right
cmd.extend(["--profile", profile_fn])
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@ -83,7 +86,13 @@ class WaczEnricher(Enricher):
logger.error(f"WACZ generation failed: {e}")
return False
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
if os.getenv('RUNNING_IN_DOCKER'):
filename = os.path.join("collections", collection, f"{collection}.wacz")
else:
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
if not os.path.exists(filename):
logger.warning(f"Unable to locate and upload WACZ {filename=}")
return False