pull/194/head
erinhmclark 2025-02-12 19:32:40 +00:00
rodzic d9d936c2ca
commit cbe98c729d
7 zmienionych plików z 538 dodań i 2 usunięć

Wyświetl plik

@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
    """Initialise the enricher with an optional webdriver factory.

    Args:
        webdriver_factory: Callable stored on the instance and later used
            to create the webdriver; ``Webdriver`` is used when this is
            falsy.
    """
    super().__init__()
    if webdriver_factory:
        self.webdriver_factory = webdriver_factory
    else:
        self.webdriver_factory = Webdriver
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
with self.webdriver_factory(
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

Wyświetl plik

@ -6,6 +6,8 @@ import pickle
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
from unittest.mock import patch
import pytest
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES
@ -128,4 +130,20 @@ def unpickle():
test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
with open(os.path.join(test_data_dir, path), "rb") as f:
return pickle.load(f)
return _unpickle
return _unpickle
@pytest.fixture
def mock_python_dependencies():
    """Yield a mock that replaces ``auto_archiver.core.module``.

    The mock returns ``True`` when called; the intent is to make every
    Python dependency look available while the fixture is active.
    """
    # NOTE(review): the patch target is a module path rather than a function
    # inside that module -- confirm this is the intended target.
    with patch("auto_archiver.core.module", return_value=True) as dependency_check:
        yield dependency_check
@pytest.fixture
def mock_binary_dependencies():
    """Yield a ``shutil.which`` mock that resolves every binary to a fake path."""
    fake_location = "/usr/bin/fake_binary"
    with patch("shutil.which", return_value=fake_location) as which_double:
        yield which_double

Wyświetl plik

@ -0,0 +1,76 @@
from unittest.mock import MagicMock, patch, Mock
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def mock_media():
    """Return a ``MagicMock`` specced on ``Media`` with a fixed filename."""
    media_double: Media = MagicMock(spec=Media)
    media_double.filename = "mock_file.txt"
    return media_double
@pytest.fixture
def enricher(setup_module):
    """Build the ``metadata_enricher`` module with an empty configuration."""
    module_config = {}
    return setup_module("metadata_enricher", module_config)
@pytest.mark.parametrize(
    "output,expected",
    [
        ("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}),
        ("InvalidLine", {}),
        ("", {}),
    ],
)
@patch("subprocess.run")
def test_get_metadata(mock_run, enricher, output, expected):
    """Check that the stdout of the mocked process is parsed as expected."""
    # Fake a finished process whose stdout is the parametrised text.
    completed = mock_run.return_value
    completed.stdout = output
    completed.stderr = ""
    completed.returncode = 0

    parsed = enricher.get_metadata("test.jpg")

    assert parsed == expected
    expected_command = ["exiftool", "test.jpg"]
    mock_run.assert_called_once_with(
        expected_command, capture_output=True, text=True
    )
@patch("subprocess.run")
def test_get_metadata_exiftool_not_found(mock_run, enricher):
    """A ``FileNotFoundError`` from ``subprocess.run`` yields an empty dict."""
    mock_run.side_effect = FileNotFoundError
    assert enricher.get_metadata("test.jpg") == {}
def test_enrich_sets_metadata(enricher):
    """Only media for which ``get_metadata`` returns data gets it stored."""
    with_data = Mock(filename="img1.jpg")
    without_data = Mock(filename="img2.jpg")
    metadata = Mock()
    metadata.media = [with_data, without_data]

    def fake_get_metadata(filename):
        # Only the first image reports any metadata.
        if filename == "img1.jpg":
            return {"key": "value"}
        return {}

    enricher.get_metadata = fake_get_metadata

    enricher.enrich(metadata)

    with_data.set.assert_called_once_with("metadata", {"key": "value"})
    without_data.set.assert_not_called()
    assert metadata.media == [with_data, without_data]
def test_enrich_empty_media(enricher):
    """Enriching an item with no media must complete without raising."""
    empty_item = Mock()
    empty_item.media = []
    # The test passes as long as this call does not raise.
    enricher.enrich(empty_item)
@patch("loguru.logger.error")
@patch("subprocess.run")
def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher):
    """An unexpected error from ``subprocess.run`` is logged once and gives ``{}``."""
    mock_run.side_effect = Exception("Test error")

    outcome = enricher.get_metadata("test.jpg")

    assert outcome == {}
    mock_logger_error.assert_called_once()

Wyświetl plik

@ -0,0 +1,84 @@
from unittest.mock import patch
import pytest
from PIL import UnidentifiedImageError
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher
@pytest.fixture
def enricher(setup_module):
    """Build the ``pdq_hash_enricher`` module with an empty configuration."""
    module_config = {}
    return setup_module("pdq_hash_enricher", module_config)
@pytest.fixture
def metadata_with_images():
    """Return a ``Metadata`` for example.com holding two image media entries."""
    item = Metadata()
    item.set_url("https://example.com")
    image_specs = (("image1.jpg", "image1"), ("image2.jpg", "image2"))
    for filename, key in image_specs:
        item.add_media(Media(filename=filename, key=key))
    return item
def test_successful_enrich(metadata_with_images):
    """Every image media ends up with a ``pdq_hash`` value."""
    fake_hash = ([1, 0, 1, 0] * 64, 100)
    with (
        patch("pdqhash.compute", return_value=fake_hash),
        patch("PIL.Image.open"),
        patch.object(Media, "is_image", return_value=True),
    ):
        PdqHashEnricher().enrich(metadata_with_images)

        # Each media item must now carry a hash.
        hashes = [media.get("pdq_hash") for media in metadata_with_images.media]
        for value in hashes:
            assert value is not None
def test_enrich_skip_non_image(metadata_with_images):
    """No hash is computed when ``Media.is_image`` reports ``False``."""
    not_an_image = patch.object(Media, "is_image", return_value=False)
    with not_an_image, patch("pdqhash.compute") as mock_pdq:
        PdqHashEnricher().enrich(metadata_with_images)
        mock_pdq.assert_not_called()
def test_enrich_handles_corrupted_image(metadata_with_images):
    """An unreadable image is logged once per media item and never hashed."""
    open_failure = UnidentifiedImageError("Corrupted image")
    with (
        patch("PIL.Image.open", side_effect=open_failure),
        patch("pdqhash.compute") as mock_pdq,
        patch("loguru.logger.error") as mock_logger,
    ):
        PdqHashEnricher().enrich(metadata_with_images)

        media_count = len(metadata_with_images.media)
        assert mock_logger.call_count == media_count
        mock_pdq.assert_not_called()
@pytest.mark.parametrize(
    "media_id, should_have_hash",
    [
        ("screenshot", False),
        ("warc-file-123", False),
        ("regular-image", True),
    ],
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash):
    """Whether a hash is stored depends on the media ``id`` property."""
    item = Metadata()
    item.set_url("https://example.com")
    item.add_media(Media(filename="image.jpg").set("id", media_id))

    fake_hash = ([1, 0, 1, 0] * 64, 100)
    with (
        patch("pdqhash.compute", return_value=fake_hash),
        patch("PIL.Image.open"),
        patch.object(Media, "is_image", return_value=True),
    ):
        PdqHashEnricher().enrich(item)

        has_hash = item.media[0].get("pdq_hash") is not None
        assert has_hash == should_have_hash

Wyświetl plik

@ -0,0 +1,205 @@
import base64
from unittest.mock import patch, MagicMock
import pytest
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
@pytest.fixture
def mock_selenium_env():
    """Patch the Selenium and driver-lookup calls in a single place.

    Yields:
        A ``(mock_driver, mock_driver_class, mock_options_instance)`` tuple,
        where ``mock_driver`` is what ``mock_driver_class`` returns and
        ``mock_options_instance`` is what the patched ``FirefoxOptions``
        returns.
    """
    fake_binary_paths = {
        "driver_path": "/mock/driver",
        "browser_path": "/mock/browser",
    }
    with (
        patch("shutil.which") as mock_which,
        patch("auto_archiver.utils.webdriver.CookieSettingDriver") as mock_driver_class,
        patch(
            "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths"
        ) as mock_binary_paths,
        patch("pathlib.Path.is_file", return_value=True),
        patch("subprocess.Popen") as mock_popen,
        patch(
            "selenium.webdriver.common.service.Service.is_connectable",
            return_value=True,
        ),
        patch("selenium.webdriver.FirefoxOptions") as mock_firefox_options,
    ):
        # Only geckodriver is reported as present on the PATH.
        mock_which.side_effect = (
            lambda dep: "/mock/geckodriver" if dep == "geckodriver" else None
        )
        mock_binary_paths.return_value = fake_binary_paths

        # Spawned process whose poll() returns None.
        process_double = MagicMock()
        process_double.poll.return_value = None
        mock_popen.return_value = process_double

        # Instantiating CookieSettingDriver hands back this mock driver.
        driver_double = MagicMock()
        mock_driver_class.return_value = driver_double

        # Instantiating FirefoxOptions hands back this mock options object.
        options_double = MagicMock()
        mock_firefox_options.return_value = options_double

        yield driver_double, mock_driver_class, options_double
@pytest.fixture
def common_patches(tmp_path):
    """Patch the auth-wall check, ``os.path.join`` and ``time.sleep``.

    While active, ``os.path.join`` always returns ``<tmp_path>/test.png`` and
    ``is_auth_wall`` always returns ``False``.
    """
    screenshot_path = str(tmp_path / "test.png")
    with (
        patch("auto_archiver.utils.url.is_auth_wall", return_value=False),
        patch("os.path.join", return_value=screenshot_path),
        patch("time.sleep"),
    ):
        yield
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
    """Build the ``screenshot_enricher`` module with a fixed test configuration."""
    return setup_module(
        "screenshot_enricher",
        {
            "width": 1280,
            "height": 720,
            "timeout": 60,
            "sleep_before_screenshot": 4,
            "http_proxy": "",
            "save_to_pdf": "False",
            "print_options": {},
        },
    )
@pytest.fixture
def metadata_with_video():
    """Return a ``Metadata`` for example.com holding one video with id ``video1``."""
    item = Metadata()
    item.set_url("https://example.com")
    video = Media(filename="video.mp4").set("id", "video1")
    item.add_media(video)
    return item
def test_enrich_adds_screenshot(
    screenshot_enricher,
    metadata_with_video,
    mock_selenium_env,
    common_patches,
    tmp_path,
):
    """A successful run creates the driver, loads the URL and adds a screenshot."""
    driver, driver_class, options = mock_selenium_env

    screenshot_enricher.enrich(metadata_with_video)

    driver_class.assert_called_once_with(
        cookies=None,
        cookiejar=None,
        facebook_accept_cookies=False,
        options=options,
    )
    # Calls made on the driver returned by the patched class.
    driver.get.assert_called_once_with("https://example.com")
    expected_path = str(tmp_path / "test.png")
    driver.save_screenshot.assert_called_once_with(expected_path)

    # Original video plus the new screenshot.
    media = metadata_with_video.media
    assert len(media) == 2
    assert media[1].properties.get("id") == "screenshot"
@pytest.mark.parametrize(
    "url,is_auth",
    [
        ("https://example.com", False),
        ("https://private.com", True),
    ],
)
def test_enrich_auth_wall(
    screenshot_enricher,
    metadata_with_video,
    mock_selenium_env,
    common_patches,
    url,
    is_auth,
):
    """The page is only loaded and captured when it is not behind an auth wall."""
    driver = mock_selenium_env[0]

    with patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth):
        metadata_with_video.set_url(url)
        screenshot_enricher.enrich(metadata_with_video)

        media = metadata_with_video.media
        if not is_auth:
            # Open page: URL fetched and a screenshot appended.
            driver.get.assert_called_once_with(url)
            assert len(media) == 2
            assert media[1].properties.get("id") == "screenshot"
        else:
            # Auth wall: driver untouched, only the original video remains.
            driver.get.assert_not_called()
            assert len(media) == 1
            assert media[0].properties.get("id") == "video1"
def test_handle_timeout_exception(
    screenshot_enricher, metadata_with_video, mock_selenium_env
):
    """A ``TimeoutException`` on page load is logged at info level; no media is added."""
    driver = mock_selenium_env[0]
    driver.get.side_effect = TimeoutException

    with patch("loguru.logger.info") as info_log:
        screenshot_enricher.enrich(metadata_with_video)
        info_log.assert_called_once_with(
            "TimeoutException loading page for screenshot"
        )
        assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
    screenshot_enricher, metadata_with_video, mock_selenium_env
):
    """An unexpected error while saving the screenshot is logged and adds no media."""
    driver = mock_selenium_env[0]
    # Page load succeeds; taking the screenshot blows up.
    driver.get.return_value = None
    driver.save_screenshot.side_effect = Exception("Unexpected Error")

    with patch("loguru.logger.error") as error_log:
        screenshot_enricher.enrich(metadata_with_video)
        expected_message = (
            "Got error while loading webdriver for screenshot enricher: Unexpected Error"
        )
        error_log.assert_called_once_with(expected_message)
        # The failed screenshot must not have been attached.
        assert len(metadata_with_video.media) == 1
def test_pdf_creation(screenshot_enricher, metadata_with_video, mock_selenium_env):
    """Test PDF creation when ``save_to_pdf`` is enabled.

    With the option switched on, the enricher is expected to take the
    screenshot, call ``print_page`` on the driver, write the PDF to
    ``pdf_<random>.pdf`` and attach both files as media.
    """
    mock_driver, _driver_class, _options = mock_selenium_env

    # Override the save_to_pdf option set by the fixture.
    screenshot_enricher.save_to_pdf = True

    # print_page hands back base64-encoded PDF content.
    mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode(
        "utf-8"
    )

    with (
        # Make os.path.join return just its last component so file names
        # are predictable.
        patch("os.path.join", side_effect=lambda *args: f"{args[-1]}"),
        patch(
            "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
            return_value="fixed123",
        ),
        # new_callable must be a factory (the class), not an instance.
        patch("builtins.open", new_callable=MagicMock) as mock_open,
        patch("loguru.logger.error"),
    ):
        screenshot_enricher.enrich(metadata_with_video)

    # Assertions run after the patches are undone so that the real open()
    # and os.path.join are available to the test runner if one of them fails.

    # Verify screenshot and PDF creation.
    mock_driver.save_screenshot.assert_called_once()
    mock_driver.print_page.assert_called_once_with(mock_driver.print_options)

    # Check that the PDF file was opened for binary writing.
    mock_open.assert_any_call("pdf_fixed123.pdf", "wb")

    # Original video + screenshot + PDF.
    assert len(metadata_with_video.media) == 3
    assert metadata_with_video.media[1].properties.get("id") == "screenshot"
    assert metadata_with_video.media[2].properties.get("id") == "pdf"
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
    """Empty ``tmp_path`` after every test in this module.

    Regular files and symlinks are unlinked; directories are removed
    recursively, since ``Path.unlink`` raises on a directory.
    """
    import shutil

    yield
    # Snapshot the listing first so entries are not deleted mid-iteration.
    for entry in list(tmp_path.iterdir()):
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()

Wyświetl plik

@ -0,0 +1,54 @@
import ssl
from unittest.mock import patch, mock_open
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def enricher(setup_module):
    """Build the ``ssl_enricher`` module with ``skip_when_nothing_archived`` set."""
    return setup_module(
        "ssl_enricher",
        {
            "skip_when_nothing_archived": "True",
        },
    )
@pytest.fixture
def metadata():
    """Return a ``Metadata`` for https://example.com with two text-file media."""
    item = Metadata()
    item.set_url("https://example.com")
    for path in ("tests/data/testfile_1.txt", "tests/data/testfile_2.txt"):
        item.add_media(Media(path))
    return item
def test_http_raises(metadata, enricher):
    """A plain ``http://`` URL is rejected with an ``AssertionError``."""
    metadata.set_url("http://example.com")

    with pytest.raises(AssertionError) as raised:
        enricher.enrich(metadata)

    # Checked after the block: code following the raising call inside the
    # ``with`` body would never execute.
    message = str(raised.value)
    assert "Invalid URL scheme" in message
def test_empty_metadata(metadata, enricher):
    """``enrich`` returns ``None`` when the item has no media."""
    metadata.media = []
    outcome = enricher.enrich(metadata)
    assert outcome is None
def test_ssl_enrich(metadata, enricher):
    """The fetched certificate is written to a ``.pem`` file and added as media."""
    cert_patch = patch("ssl.get_server_certificate", return_value="TEST_CERT")
    open_patch = patch("builtins.open", mock_open())

    with cert_patch as mock_get_cert, open_patch as mock_file:
        enricher.enrich(metadata)

        mock_get_cert.assert_called_once_with(("example.com", 443))
        mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w")
        file_handle = mock_file.return_value
        file_handle.write.assert_called_once_with("TEST_CERT")

    # The certificate file must be attached to the metadata.
    filenames = [media.filename for media in metadata.media]
    assert any(name.endswith("example-com.pem") for name in filenames)
def test_ssl_error_handling(enricher, metadata):
    """An ``ssl.SSLError`` raised while fetching the certificate propagates."""
    failure = ssl.SSLError("SSL error")
    failing_fetch = patch("ssl.get_server_certificate", side_effect=failure)
    with failing_fetch, pytest.raises(ssl.SSLError, match="SSL error"):
        enricher.enrich(metadata)

Wyświetl plik

@ -0,0 +1,93 @@
import shutil
import sys
import pytest
from unittest.mock import MagicMock, patch
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.modules.whisper_enricher import WhisperEnricher
@pytest.fixture
def enricher():
    """Yield a hand-configured ``WhisperEnricher`` with its mocked S3 storage.

    Yields:
        A ``(enricher, mock_s3)`` tuple.
    """
    config = {
        "api_endpoint": "http://testapi",
        "api_key": "whisper-key",
        "include_srt": False,
        "timeout": 5,
        "action": "translate",
        "steps": {"storages": ["s3_storage"]},
    }

    s3_double = MagicMock(spec=S3Storage)
    s3_double.get_cdn_url.return_value = "http://s3.example.com/media.mp3"

    whisper = WhisperEnricher()
    whisper.name = "whisper_enricher"
    whisper.display_name = "Whisper Enricher"
    whisper.config_setup({whisper.name: config})

    # The setup method is bypassed, so wire the storages and S3 mock by hand.
    whisper.stores = config["steps"]["storages"]
    whisper.s3 = s3_double

    yield whisper, s3_double
@pytest.fixture
def metadata():
    """Return a ``Metadata`` with a test URL and title set."""
    item = Metadata()
    item.set_url("http://test.url")
    item.set_title("test title")
    return item
@pytest.fixture
def mock_requests():
    """Yield the patched ``requests`` module used by the whisper enricher.

    ``post`` is preconfigured to return a 201 response whose JSON body is
    ``{"id": "job123"}``.
    """
    target = "auto_archiver.modules.whisper_enricher.whisper_enricher.requests"
    with patch(target) as requests_double:
        created = MagicMock()
        created.status_code = 201
        created.json.return_value = {"id": "job123"}
        requests_double.post.return_value = created
        yield requests_double
def test_successful_job_submission(enricher, metadata, mock_requests):
    """Test successful media processing with S3 configured."""

    def make_response(status_code, payload):
        # Build a fake HTTP response with the given status and JSON body.
        response = MagicMock()
        response.status_code = status_code
        response.json.return_value = payload
        return response

    whisper, mock_s3 = enricher

    # The CDN URL reported by S3 must match the URL attached to the media.
    mock_s3.get_cdn_url.return_value = "http://cdn.example.com/test.mp4"
    video = Media("test.mp4")
    video.mimetype = "video/mp4"
    video.add_url(mock_s3.get_cdn_url.return_value)
    metadata.media = [video]

    # GET is called twice: first the job status, then the artifacts.
    status_response = make_response(200, {"status": "success", "meta": {}})
    artifacts_response = make_response(
        200,
        [{"data": [{"start": 0, "end": 5, "text": "test transcript"}]}],
    )
    mock_requests.get.side_effect = [status_response, artifacts_response]

    # Run enrichment (without opening the file).
    whisper.enrich(metadata)

    # The job must have been submitted exactly once with the CDN URL.
    mock_requests.post.assert_called_once_with(
        "http://testapi/jobs",
        json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
        headers={"Authorization": "Bearer whisper-key"},
    )

    # One status check plus one artifacts fetch.
    assert mock_requests.get.call_count == 2
    assert "artifact_0_text" in metadata.media[0].get("whisper_model")
    assert "test transcript" in metadata.metadata.get("content")