# auto-archiver/tests/enrichers/test_wacz_enricher.py
#
# 113 lines
# 4.0 KiB
# Python

import os
from zipfile import ZipFile
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
    """Build the ``wacz_extractor_enricher`` module from a fixed test configuration.

    The module is created through the ``setup_module`` fixture.
    ``mock_binary_dependencies`` is requested but never referenced in the body
    (presumably it stubs out external binaries; confirm in conftest).
    """
    return setup_module(
        "wacz_extractor_enricher",
        {
            "profile": None,
            "docker_commands": None,
            "timeout": 120,
            "extract_media": False,
            "extract_screenshot": True,
            "socks_proxy_host": None,
            "socks_proxy_port": None,
            "proxy_server": None,
        },
    )
def test_setup_without_docker(wacz_enricher, mocker):
    """With only RUNNING_IN_DOCKER set, setup() leaves ``docker_in_docker`` falsy."""
    # clear=True removes every other variable, so this is the whole environment.
    environment = {"RUNNING_IN_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    assert not wacz_enricher.docker_in_docker
def test_setup_with_docker(wacz_enricher, mocker):
    """With only WACZ_ENABLE_DOCKER set, setup() leaves ``use_docker`` truthy."""
    # clear=True removes every other variable, so this is the whole environment.
    environment = {"WACZ_ENABLE_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    assert wacz_enricher.use_docker
def test_already_ran(wacz_enricher, metadata, mocker):
    """enrich() returns True and logs a notice when "browsertrix" media is already attached."""
    existing_wacz = Media("test.wacz")
    metadata.add_media(existing_wacz, id="browsertrix")
    info_mock = mocker.patch("loguru.logger.info")

    outcome = wacz_enricher.enrich(metadata)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_text = info_mock.call_args[0][0]
    assert "WACZ enricher had already been executed" in logged_text
def test_basic_call_execution(wacz_enricher, mocker):
    """enrich() calls subprocess.run with the target URL in the command."""
    run_mock = mocker.patch(
        "subprocess.run",
        return_value=mocker.Mock(returncode=0),
    )
    target = Metadata().set_url("https://example.com")

    wacz_enricher.enrich(target)

    assert run_mock.called
    # The first positional argument of the last call is the command's parts;
    # joining them lets us check the flag and its value as one substring.
    command_text = " ".join(run_mock.call_args[0][0])
    assert "--url https://example.com" in command_text
def test_download_success(wacz_enricher, mocker) -> None:
    """download() returns a Metadata with status "wacz: success" when enrich() yields True."""
    item = Metadata().set_url("https://example.com")
    # Replace enrich() so only download()'s own handling of a True result is exercised.
    mocker.patch.object(wacz_enricher, "enrich", return_value=True)

    downloaded = wacz_enricher.download(item)

    assert downloaded is not None
    assert isinstance(downloaded, Metadata)
    assert downloaded.status == "wacz: success"
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
    """enrich() returns True and logs a notice when the item already has "browsertrix" media."""
    info_mock = mocker.patch("loguru.logger.info")
    item = Metadata().set_url("https://example.com")
    item.add_media(Media(filename="some_file.wacz"), id="browsertrix")

    outcome = wacz_enricher.enrich(item)

    assert outcome is True
    # First positional argument of the most recent logger.info call.
    logged_text = info_mock.call_args[0][0]
    assert "WACZ enricher had already been executed:" in logged_text
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
    """enrich() returns False when subprocess.run raises."""
    # Point the enricher at a throwaway directory and pin the extraction flags.
    wacz_enricher.tmp_dir = str(tmp_path)
    wacz_enricher.extract_media = False
    wacz_enricher.extract_screenshot = True

    mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
    # Any attempt to launch the external command raises.
    mocker.patch("subprocess.run", side_effect=Exception("fail"))

    item = Metadata().set_url("https://example.com")
    outcome = wacz_enricher.enrich(item)

    assert outcome is False
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
    """extract_media_from_wacz adds a "browsertrix-screenshot" media entry."""
    wacz_enricher.tmp_dir = str(tmp_path)

    # Write a genuine zip archive to disk so ZipFile can open it without failing.
    archive_path = tmp_path / "dummy.wacz"
    with ZipFile(archive_path, "w") as archive:
        archive.writestr("dummy.txt", "test content")

    mocker.patch("os.listdir", return_value=[])

    # One WARC "resource" record carrying a PNG payload, followed by a bare
    # second record header.
    warc_payload = b"".join(
        (
            b"WARC/1.0\r\n",
            b"WARC-Type: resource\r\n",
            b"Content-Type: image/png\r\n",
            b"WARC-Target-URI: http://example.com/image.png\r\n",
            b"Content-Length: 12\r\n",
            b"\r\n",
            b"image-bytes",
            b"\r\n\r\nWARC/1.0\r\n\r\n",
        )
    )
    # From here on, every open() call serves the WARC bytes above.
    mocker.patch("builtins.open", mocker.mock_open(read_data=warc_payload))

    metadata.add_media(Media("something.wacz"), "browsertrix")
    wacz_enricher.extract_media_from_wacz(metadata, str(archive_path))

    # The pre-existing WACZ media plus the one added by the extraction.
    assert len(metadata.media) == 2
    added_media = metadata.media[1]
    assert added_media.properties.get("id") == "browsertrix-screenshot"