kopia lustrzana https://github.com/bellingcat/auto-archiver
Tweak WACZ enricher docs + add comment on WACZ_ENABLE_DOCKER
rodzic
799cef3a8c
commit
f22af5e123
|
@ -5,6 +5,7 @@ by handling user configuration, validating the steps properties, and implementin
|
|||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import subprocess
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, TYPE_CHECKING, Type
|
||||
|
@ -17,7 +18,7 @@ import os
|
|||
from os.path import join
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
|
@ -216,9 +217,9 @@ class LazyBaseModule:
|
|||
if not check(dep):
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
|
||||
)
|
||||
exit(1)
|
||||
raise SetupError()
|
||||
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
|
@ -238,9 +239,18 @@ class LazyBaseModule:
|
|||
return find_spec(dep)
|
||||
|
||||
def check_bin_dep(dep):
|
||||
if dep == "docker" and os.environ.get("RUNNING_IN_DOCKER"):
|
||||
return True
|
||||
return shutil.which(dep)
|
||||
dep_exists = shutil.which(dep)
|
||||
|
||||
if dep == "docker":
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
# this is only for the WACZ enricher, which requires docker
|
||||
# if we're already running in docker then we don't need docker
|
||||
return True
|
||||
|
||||
# check if docker daemon is running
|
||||
return dep_exists and subprocess.run(["docker", "ps", "-q"]).returncode == 0
|
||||
|
||||
return dep_exists
|
||||
|
||||
check_deps(self.dependencies.get("python", []), check_python_dep)
|
||||
check_deps(self.dependencies.get("bin", []), check_bin_dep)
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"configs": {
|
||||
"profile": {
|
||||
"default": None,
|
||||
"help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
|
||||
"help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
|
||||
},
|
||||
"docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
|
@ -40,14 +40,27 @@
|
|||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||
|
||||
### Features
|
||||
## Setup
|
||||
|
||||
**Docker**
|
||||
If you are using the Docker image to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
**Browsertrix Profiles**
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
|
||||
for more information.
|
||||
|
||||
**Docker in Docker**
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
## Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
### Notes
|
||||
- Requires Docker for running `browsertrix-crawler`.
|
||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||
""",
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ from zipfile import ZipFile
|
|||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
|
|||
return wacz
|
||||
|
||||
|
||||
def test_raises_error_without_docker_installed(setup_module, mocker, caplog):
|
||||
# pretend that docker isn't installed
|
||||
mocker.patch("shutil.which").return_value = None
|
||||
with pytest.raises(SetupError):
|
||||
setup_module("wacz_extractor_enricher", {})
|
||||
|
||||
assert "requires external dependency 'docker' which is not available/setup" in caplog.text
|
||||
|
||||
|
||||
def test_setup_without_docker(wacz_enricher, mocker):
|
||||
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
|
||||
wacz_enricher.setup()
|
||||
|
|
Ładowanie…
Reference in New Issue