From a6fc4e1bb134ff72f30e87cf3ac35324c3c8085d Mon Sep 17 00:00:00 2001 From: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 24 Jan 2025 13:59:29 +0000 Subject: [PATCH] modifies base docker image to use browsertrix 1.4.2 (#182) * modifies base image to newest browsertrix version * modify browsertrix cmd args based on recent experience --- Dockerfile | 12 +++++++----- src/auto_archiver/enrichers/wacz_enricher.py | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0ecc7f3..cbcfdd4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.0.4 AS base +FROM webrecorder/browsertrix-crawler:1.4.2 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ @@ -29,21 +29,23 @@ ENV POETRY_NO_INTERACTION=1 \ POETRY_VIRTUALENVS_CREATE=1 -RUN pip install --upgrade pip && \ - pip install "poetry>=2.0.0,<3.0.0" +# Create a virtual environment for poetry and install it +RUN python3 -m venv /poetry-venv && \ + /poetry-venv/bin/python -m pip install --upgrade pip && \ + /poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0" WORKDIR /app COPY pyproject.toml poetry.lock README.md ./ # Copy dependency files and install dependencies (excluding the package itself) -RUN poetry install --only main --no-root --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache # Copy code: This is needed for poetry to install the package itself, # but the environment should be cached from the previous step if toml and lock files haven't changed COPY ./src/ . -RUN poetry install --only main --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-cache # Update PATH to include virtual environment binaries diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index dc38488..3c39056 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -84,7 +84,8 @@ class WaczArchiverEnricher(Enricher, Archiver): "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", "--behaviorTimeout", str(self.timeout), "--timeout", str(self.timeout), - "--blockAds" # TODO: test + "--diskUtilization", "99", + # "--blockAds" # note: this has been known to cause issues on cloudflare protected sites ] if self.docker_in_docker: