kopia lustrzana https://github.com/bellingcat/auto-archiver
113 wiersze
4.0 KiB
Python
113 wiersze
4.0 KiB
Python
import os
|
|
from zipfile import ZipFile
|
|
|
|
import pytest
|
|
|
|
from auto_archiver.core import Metadata, Media
|
|
|
|
|
|
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
    """Return the ``wacz_extractor_enricher`` module built by ``setup_module``.

    The module is created with a fixed test configuration: no profile, no
    custom docker commands, no proxy settings, a 120 second timeout, media
    extraction off and screenshot extraction on.

    ``mock_binary_dependencies`` is requested but never referenced in the
    body, so it is only here for whatever that fixture sets up.
    """
    module_config: dict = {
        "profile": None,
        "docker_commands": None,
        "timeout": 120,
        "extract_media": False,
        "extract_screenshot": True,
        "socks_proxy_host": None,
        "socks_proxy_port": None,
        "proxy_server": None,
    }
    return setup_module("wacz_extractor_enricher", module_config)
|
|
|
|
|
|
def test_setup_without_docker(wacz_enricher, mocker):
    """``setup()`` leaves ``docker_in_docker`` falsy when only RUNNING_IN_DOCKER is set."""
    # clear=True wipes every other variable for the duration of the test,
    # so RUNNING_IN_DOCKER is the only thing setup() can see.
    environment = {"RUNNING_IN_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    docker_in_docker = wacz_enricher.docker_in_docker
    assert not docker_in_docker
|
|
|
|
|
|
def test_setup_with_docker(wacz_enricher, mocker):
    """``setup()`` leaves ``use_docker`` truthy when only WACZ_ENABLE_DOCKER is set."""
    # clear=True wipes every other variable for the duration of the test,
    # so WACZ_ENABLE_DOCKER is the only thing setup() can see.
    environment = {"WACZ_ENABLE_DOCKER": "1"}
    mocker.patch.dict(os.environ, environment, clear=True)

    wacz_enricher.setup()

    use_docker = wacz_enricher.use_docker
    assert use_docker
|
|
|
|
|
|
def test_already_ran(wacz_enricher, metadata, mocker):
    """``enrich`` returns True and logs a notice when a "browsertrix" media exists."""
    existing_wacz = Media("test.wacz")
    metadata.add_media(existing_wacz, id="browsertrix")
    info_spy = mocker.patch("loguru.logger.info")

    outcome = wacz_enricher.enrich(metadata)

    assert outcome is True
    # Only the most recent info() call is inspected.
    last_message = info_spy.call_args.args[0]
    assert "WACZ enricher had already been executed" in last_message
|
|
|
|
|
|
def test_basic_call_execution(wacz_enricher, mocker):
    """``enrich`` invokes ``subprocess.run`` with the target URL in the command."""
    completed = mocker.Mock(returncode=0)
    run_spy = mocker.patch("subprocess.run", return_value=completed)
    target = Metadata().set_url("https://example.com")

    wacz_enricher.enrich(target)

    assert run_spy.called
    # The first positional argument of the last run() call is the command
    # sequence; flatten it to check that the url flag and value are adjacent.
    command_line = " ".join(run_spy.call_args.args[0])
    assert "--url https://example.com" in command_line
|
|
|
|
|
|
def test_download_success(wacz_enricher, mocker) -> None:
    """Check that ``download`` hands back Metadata marked successful when ``enrich`` succeeds."""
    # Stub enrich() so download() sees a successful enrichment without any
    # real work being done.
    mocker.patch.object(wacz_enricher, "enrich", return_value=True)
    source_metadata = Metadata().set_url("https://example.com")

    downloaded = wacz_enricher.download(source_metadata)

    assert downloaded is not None
    assert isinstance(downloaded, Metadata)
    assert downloaded.status == "wacz: success"
|
|
|
|
|
|
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
    """Check ``enrich`` on metadata that already carries a "browsertrix" media."""
    info_spy = mocker.patch("loguru.logger.info")
    archived = Metadata().set_url("https://example.com")
    archived.add_media(Media(filename="some_file.wacz"), id="browsertrix")

    outcome = wacz_enricher.enrich(archived)

    assert outcome is True
    # Only the most recent info() call is inspected.
    last_message = info_spy.call_args.args[0]
    assert "WACZ enricher had already been executed:" in last_message
|
|
|
|
|
|
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
    """Check that ``enrich`` reports False when ``subprocess.run`` raises."""
    # Point the enricher at a throwaway directory and pin its extraction flags.
    overrides = {
        "tmp_dir": str(tmp_path),
        "extract_media": False,
        "extract_screenshot": True,
    }
    for attribute, value in overrides.items():
        setattr(wacz_enricher, attribute, value)

    mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
    # Any call to subprocess.run now blows up with a generic exception.
    mocker.patch("subprocess.run", side_effect=Exception("fail"))

    target = Metadata().set_url("https://example.com")
    outcome = wacz_enricher.enrich(target)

    assert outcome is False
|
|
|
|
|
|
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
    """Check that ``extract_media_from_wacz`` adds a screenshot media entry."""
    wacz_enricher.tmp_dir = str(tmp_path)

    # A genuine zip archive on disk, so opening it with ZipFile succeeds.
    wacz_path = tmp_path / "dummy.wacz"
    with ZipFile(wacz_path, "w") as archive:
        archive.writestr("dummy.txt", "test content")

    # Single WARC "resource" record carrying a PNG payload, followed by the
    # start of another record.
    warc_payload = b"".join(
        [
            b"WARC/1.0\r\n",
            b"WARC-Type: resource\r\n",
            b"Content-Type: image/png\r\n",
            b"WARC-Target-URI: http://example.com/image.png\r\n",
            b"Content-Length: 12\r\n",
            b"\r\n",
            b"image-bytes",
            b"\r\n\r\nWARC/1.0\r\n\r\n",
        ]
    )

    # Directory listings come back empty, and every open() yields the WARC bytes.
    mocker.patch("os.listdir", return_value=[])
    mocker.patch("builtins.open", mocker.mock_open(read_data=warc_payload))

    metadata.add_media(Media("something.wacz"), "browsertrix")
    wacz_enricher.extract_media_from_wacz(metadata, str(wacz_path))

    # The pre-existing wacz media plus the newly extracted screenshot.
    assert len(metadata.media) == 2
    screenshot = metadata.media[1]
    assert screenshot.properties.get("id") == "browsertrix-screenshot"
|