Modifies the base Docker image to use browsertrix-crawler 1.4.2 (#182)

* modifies the base image to the newest browsertrix-crawler version

* modifies browsertrix command-line arguments based on recent experience
pull/196/head
Miguel Sozinho Ramalho 2025-01-24 13:59:29 +00:00 zatwierdzone przez GitHub
rodzic d4fff0b6eb
commit a6fc4e1bb1
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
2 zmienionych plików z 9 dodań i 6 usunięć

Wyświetl plik

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
FROM webrecorder/browsertrix-crawler:1.4.2 AS base
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \
@ -29,21 +29,23 @@ ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_CREATE=1
RUN pip install --upgrade pip && \
pip install "poetry>=2.0.0,<3.0.0"
# Create a virtual environment for poetry and install it
RUN python3 -m venv /poetry-venv && \
/poetry-venv/bin/python -m pip install --upgrade pip && \
/poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0"
WORKDIR /app
COPY pyproject.toml poetry.lock README.md ./
# Copy dependency files and install dependencies (excluding the package itself)
RUN poetry install --only main --no-root --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache
# Copy code: This is needed for poetry to install the package itself,
# but the environment should be cached from the previous step if toml and lock files haven't changed
COPY ./src/ .
RUN poetry install --only main --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-cache
# Update PATH to include virtual environment binaries

Wyświetl plik

@ -84,7 +84,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout),
"--blockAds" # TODO: test
"--diskUtilization", "99",
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
]
if self.docker_in_docker: