pull/194/head
erinhmclark 2025-02-12 19:32:40 +00:00
rodzic d9d936c2ca
commit cbe98c729d
7 zmienionych plików z 538 dodań i 2 usunięć

Wyświetl plik

@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher): class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
    """Initialise the enricher with an optional webdriver factory.

    Args:
        webdriver_factory: Callable that ``enrich`` invokes to build the
            driver context manager. Any falsy value (including the default
            ``None``) selects ``Webdriver``.
    """
    super().__init__()
    factory = webdriver_factory or Webdriver
    self.webdriver_factory = factory
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
logger.debug(f"Enriching screenshot for {url=}") logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url) auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, with self.webdriver_factory(
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver: http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try: try:
driver.get(url) driver.get(url)
@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
except Exception as e: except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

Wyświetl plik

@ -6,6 +6,8 @@ import pickle
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Dict, Tuple from typing import Dict, Tuple
import hashlib import hashlib
from unittest.mock import patch
import pytest import pytest
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES
@ -128,4 +130,20 @@ def unpickle():
test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
with open(os.path.join(test_data_dir, path), "rb") as f: with open(os.path.join(test_data_dir, path), "rb") as f:
return pickle.load(f) return pickle.load(f)
return _unpickle return _unpickle
@pytest.fixture
def mock_python_dependencies():
    """Yield a mock patched over ``auto_archiver.core.module`` that returns True when called."""
    # NOTE(review): the patch target is the whole ``auto_archiver.core.module``
    # attribute rather than a single dependency-check function, even though the
    # mock is named like one -- confirm this is the intended target.
    with patch("auto_archiver.core.module", return_value=True) as mock_check_python_dep:
        # Mock all Python dependencies as available
        yield mock_check_python_dep
@pytest.fixture
def mock_binary_dependencies():
    """Patch ``shutil.which`` so every lookup resolves to a fake binary path; yield the mock."""
    fake_binary = "/usr/bin/fake_binary"
    # Mock all binary dependencies as available
    with patch("shutil.which", return_value=fake_binary) as mock_shutil_which:
        yield mock_shutil_which

Wyświetl plik

@ -0,0 +1,76 @@
from unittest.mock import MagicMock, patch, Mock
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def mock_media():
    """Provide a ``MagicMock`` restricted to the ``Media`` spec with a preset filename."""
    media_double: Media = MagicMock(spec=Media)
    media_double.filename = "mock_file.txt"
    return media_double
@pytest.fixture
def enricher(setup_module):
    """Build the ``metadata_enricher`` module through ``setup_module`` with an empty config."""
    module_config = {}
    return setup_module("metadata_enricher", module_config)
@pytest.mark.parametrize(
    "output,expected",
    [
        ("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}),
        ("InvalidLine", {}),
        ("", {}),
    ],
)
@patch("subprocess.run")
def test_get_metadata(mock_run, enricher, output, expected):
    """``get_metadata`` turns the mocked exiftool stdout into the expected dict.

    Also checks that ``subprocess.run`` is invoked exactly once with the
    exiftool command line for the given file.
    """
    completed = mock_run.return_value
    completed.stdout = output
    completed.stderr = ""
    completed.returncode = 0

    parsed = enricher.get_metadata("test.jpg")

    assert parsed == expected
    expected_command = ["exiftool", "test.jpg"]
    mock_run.assert_called_once_with(expected_command, capture_output=True, text=True)
@patch("subprocess.run")
def test_get_metadata_exiftool_not_found(mock_run, enricher):
    """``get_metadata`` yields an empty dict when ``subprocess.run`` raises ``FileNotFoundError``."""
    mock_run.side_effect = FileNotFoundError
    assert enricher.get_metadata("test.jpg") == {}
def test_enrich_sets_metadata(enricher):
    """``enrich`` stores metadata only on media for which ``get_metadata`` returns data."""
    first, second = Mock(filename="img1.jpg"), Mock(filename="img2.jpg")
    metadata = Mock()
    metadata.media = [first, second]

    def fake_get_metadata(f):
        # Only the first image reports any metadata.
        if f == "img1.jpg":
            return {"key": "value"}
        return {}

    enricher.get_metadata = fake_get_metadata

    enricher.enrich(metadata)

    first.set.assert_called_once_with("metadata", {"key": "value"})
    second.set.assert_not_called()
    # The media list itself must be left untouched.
    assert metadata.media == [first, second]
def test_enrich_empty_media(enricher):
    """``enrich`` completes without raising when the metadata has no media."""
    empty_metadata = Mock()
    empty_metadata.media = []
    # Should not raise errors
    enricher.enrich(empty_metadata)
@patch("loguru.logger.error")
@patch("subprocess.run")
def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher):
    """A generic exception from ``subprocess.run`` gives ``{}`` and exactly one error log."""
    mock_run.side_effect = Exception("Test error")
    assert enricher.get_metadata("test.jpg") == {}
    mock_logger_error.assert_called_once()

Wyświetl plik

@ -0,0 +1,84 @@
from unittest.mock import patch
import pytest
from PIL import UnidentifiedImageError
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher
@pytest.fixture
def enricher(setup_module):
    """Build the ``pdq_hash_enricher`` module through ``setup_module`` with an empty config."""
    module_config = {}
    return setup_module("pdq_hash_enricher", module_config)
@pytest.fixture
def metadata_with_images():
    """Return a ``Metadata`` for https://example.com holding two keyed JPEG media entries."""
    item = Metadata()
    item.set_url("https://example.com")
    for filename, key in (("image1.jpg", "image1"), ("image2.jpg", "image2")):
        item.add_media(Media(filename=filename, key=key))
    return item
def test_successful_enrich(metadata_with_images):
    """Every media item receives a ``pdq_hash`` when hashing succeeds.

    ``pdqhash.compute`` and ``PIL.Image.open`` are stubbed so no real image
    data is needed, and ``Media.is_image`` is forced to True so both media
    entries are treated as images.
    """
    with (
        patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
        patch("PIL.Image.open"),
        # The mock itself is never inspected, so it is not bound to a name.
        patch.object(Media, "is_image", return_value=True),
    ):
        enricher = PdqHashEnricher()
        enricher.enrich(metadata_with_images)

    # Ensure the hash is set for image media
    for media in metadata_with_images.media:
        assert media.get("pdq_hash") is not None
def test_enrich_skip_non_image(metadata_with_images):
    """``pdqhash.compute`` is never called when ``Media.is_image`` reports False."""
    with (
        patch.object(Media, "is_image", return_value=False),
        patch("pdqhash.compute") as compute_mock,
    ):
        PdqHashEnricher().enrich(metadata_with_images)
        compute_mock.assert_not_called()
def test_enrich_handles_corrupted_image(metadata_with_images):
    """An unreadable image logs one error per media item and is never hashed."""
    open_failure = UnidentifiedImageError("Corrupted image")
    with (
        patch("PIL.Image.open", side_effect=open_failure),
        patch("pdqhash.compute") as compute_mock,
        patch("loguru.logger.error") as error_log,
    ):
        PdqHashEnricher().enrich(metadata_with_images)

        expected_errors = len(metadata_with_images.media)
        assert error_log.call_count == expected_errors
        compute_mock.assert_not_called()
@pytest.mark.parametrize(
    "media_id, should_have_hash",
    [
        ("screenshot", False),
        ("warc-file-123", False),
        ("regular-image", True),
    ]
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash):
    """Whether a media item gets a ``pdq_hash`` depends on its ``id`` property."""
    item = Metadata()
    item.set_url("https://example.com")
    item.add_media(Media(filename="image.jpg").set("id", media_id))

    with (
        patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
        patch("PIL.Image.open"),
        patch.object(Media, "is_image", return_value=True),
    ):
        PdqHashEnricher().enrich(item)

    was_hashed = item.media[0].get("pdq_hash") is not None
    assert was_hashed == should_have_hash

Wyświetl plik

@ -0,0 +1,205 @@
import base64
from unittest.mock import patch, MagicMock
import pytest
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
@pytest.fixture
def mock_selenium_env():
    """Patch the Selenium and driver-discovery calls and yield the resulting mocks.

    Yields:
        A ``(driver, driver_class, firefox_options)`` tuple: the mock returned
        by the patched ``CookieSettingDriver``, the patched class itself, and
        the mock returned by the patched ``FirefoxOptions``.
    """
    with (
        patch("shutil.which") as which_mock,
        patch("auto_archiver.utils.webdriver.CookieSettingDriver") as driver_cls,
        patch(
            "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths"
        ) as binary_paths_mock,
        patch("pathlib.Path.is_file", return_value=True),
        patch("subprocess.Popen") as popen_mock,
        patch(
            "selenium.webdriver.common.service.Service.is_connectable",
            return_value=True,
        ),
        patch("selenium.webdriver.FirefoxOptions") as firefox_options_cls,
    ):
        # Only geckodriver is reported as installed; every other lookup misses.
        which_mock.side_effect = (
            lambda dep: "/mock/geckodriver" if dep == "geckodriver" else None
        )

        binary_paths_mock.return_value = {
            "driver_path": "/mock/driver",
            "browser_path": "/mock/browser",
        }

        # Spawned process whose poll() keeps returning None.
        process = MagicMock()
        process.poll.return_value = None
        popen_mock.return_value = process

        # Instantiating CookieSettingDriver hands back this mock driver.
        driver = MagicMock()
        driver_cls.return_value = driver

        options = MagicMock()
        firefox_options_cls.return_value = options

        yield driver, driver_cls, options
@pytest.fixture
def common_patches(tmp_path):
    """Disable the auth-wall check, pin ``os.path.join`` to a tmp file and stub ``time.sleep``."""
    screenshot_path = str(tmp_path / "test.png")
    with patch("auto_archiver.utils.url.is_auth_wall", return_value=False):
        with patch("os.path.join", return_value=screenshot_path):
            with patch("time.sleep"):
                yield
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
    """Build the ``screenshot_enricher`` module with a fixed test configuration."""
    # NOTE(review): "save_to_pdf" is the string "False", which is truthy if it
    # is ever used unconverted -- confirm setup_module coerces it to a bool.
    configs: dict = dict(
        width=1280,
        height=720,
        timeout=60,
        sleep_before_screenshot=4,
        http_proxy="",
        save_to_pdf="False",
        print_options={},
    )
    return setup_module("screenshot_enricher", configs)
@pytest.fixture
def metadata_with_video():
    """Return a ``Metadata`` for https://example.com with one video media whose id is ``video1``."""
    item = Metadata()
    item.set_url("https://example.com")
    video = Media(filename="video.mp4").set("id", "video1")
    item.add_media(video)
    return item
def test_enrich_adds_screenshot(
    screenshot_enricher,
    metadata_with_video,
    mock_selenium_env,
    common_patches,
    tmp_path,
):
    """A successful enrich builds the driver, loads the URL and appends a screenshot media."""
    driver, driver_cls, firefox_options = mock_selenium_env

    screenshot_enricher.enrich(metadata_with_video)

    driver_cls.assert_called_once_with(
        cookies=None,
        cookiejar=None,
        facebook_accept_cookies=False,
        options=firefox_options,
    )

    # Verify the actual calls on the returned mock driver
    driver.get.assert_called_once_with("https://example.com")
    driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png"))

    # Check that the media was added (2 = original video + screenshot)
    media = metadata_with_video.media
    assert len(media) == 2
    assert media[1].properties.get("id") == "screenshot"
@pytest.mark.parametrize(
    "url,is_auth",
    [
        ("https://example.com", False),
        ("https://private.com", True),
    ],
)
def test_enrich_auth_wall(
    screenshot_enricher,
    metadata_with_video,
    mock_selenium_env,
    common_patches,
    url,
    is_auth,
):
    """No screenshot is taken behind an auth wall; otherwise one is added."""
    driver, _driver_cls, _options = mock_selenium_env

    # Testing with and without is_auth_wall
    with patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth):
        metadata_with_video.set_url(url)
        screenshot_enricher.enrich(metadata_with_video)

    media = metadata_with_video.media
    if not is_auth:
        driver.get.assert_called_once_with(url)
        assert len(media) == 2
        assert media[1].properties.get("id") == "screenshot"
        return

    # Auth wall: the page is never loaded and only the original video remains.
    driver.get.assert_not_called()
    assert len(media) == 1
    assert media[0].properties.get("id") == "video1"
def test_handle_timeout_exception(
    screenshot_enricher, metadata_with_video, mock_selenium_env
):
    """A ``TimeoutException`` from ``driver.get`` is logged at info level and adds no media."""
    driver, _driver_cls, _options = mock_selenium_env
    driver.get.side_effect = TimeoutException

    with patch("loguru.logger.info") as info_log:
        screenshot_enricher.enrich(metadata_with_video)

    info_log.assert_called_once_with("TimeoutException loading page for screenshot")
    assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
    screenshot_enricher, metadata_with_video, mock_selenium_env
):
    """Test proper handling of unexpected general exceptions.

    The page load succeeds but ``save_screenshot`` raises; the enricher is
    expected to log the error once and leave the media list unchanged.
    """
    mock_driver, _driver_cls, _options = mock_selenium_env

    # Simulate a generic exception when save_screenshot is called
    mock_driver.get.return_value = None
    mock_driver.save_screenshot.side_effect = Exception("Unexpected Error")

    with (
        # This test does not request ``common_patches``, so ``time.sleep`` is
        # stubbed here; the enricher is configured with
        # sleep_before_screenshot=4 and would presumably wait for real.
        patch("time.sleep"),
        patch("loguru.logger.error") as mock_log,
    ):
        screenshot_enricher.enrich(metadata_with_video)

    # Verify that the exception was logged with the expected message
    mock_log.assert_called_once_with(
        "Got error while loading webdriver for screenshot enricher: Unexpected Error"
    )
    # And no new media was added due to the error
    assert len(metadata_with_video.media) == 1
def test_pdf_creation(screenshot_enricher, metadata_with_video, mock_selenium_env):
    """Test PDF creation when save_to_pdf is enabled.

    With ``save_to_pdf`` switched on, the enricher is expected to take the
    screenshot, print the page to PDF, write it to ``pdf_<random>.pdf`` and
    add both files as media.
    """
    mock_driver, _driver_cls, _options = mock_selenium_env

    # Override the save_to_pdf option
    screenshot_enricher.save_to_pdf = True
    # Mock the print_page method to return base64-encoded content
    mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode(
        "utf-8"
    )

    with (
        # This test does not request ``common_patches``, so ``time.sleep`` is
        # stubbed here; the enricher is configured with
        # sleep_before_screenshot=4 and would presumably wait for real.
        patch("time.sleep"),
        # Reduce every joined path to its last component for stable file names.
        patch("os.path.join", side_effect=lambda *args: f"{args[-1]}"),
        patch(
            "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
            return_value="fixed123",
        ),
        # new_callable takes the mock *factory*, not an instance of it.
        patch("builtins.open", new_callable=MagicMock) as mocked_open,
        # Silence error logging; the mock itself is not inspected.
        patch("loguru.logger.error"),
    ):
        screenshot_enricher.enrich(metadata_with_video)

    # Verify screenshot and PDF creation
    mock_driver.save_screenshot.assert_called_once()
    mock_driver.print_page.assert_called_once_with(mock_driver.print_options)

    # Check that PDF file was opened and written
    mocked_open.assert_any_call("pdf_fixed123.pdf", "wb")

    # Ensure both screenshot and PDF were added as media
    assert len(metadata_with_video.media) == 3  # Original video + screenshot + PDF
    assert metadata_with_video.media[1].properties.get("id") == "screenshot"
    assert metadata_with_video.media[2].properties.get("id") == "pdf"
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
    """Delete files left directly inside ``tmp_path`` after each test.

    Runs automatically for every test in the module. Only files and symlinks
    are removed: ``Path.unlink`` raises on a directory, which would turn an
    otherwise passing test into a teardown error.
    """
    yield
    for entry in tmp_path.iterdir():
        if entry.is_file() or entry.is_symlink():
            entry.unlink()

Wyświetl plik

@ -0,0 +1,54 @@
import ssl
from unittest.mock import patch, mock_open
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def enricher(setup_module):
    """Build the ``ssl_enricher`` module with ``skip_when_nothing_archived`` set to ``"True"``."""
    configs: dict = {"skip_when_nothing_archived": "True"}
    return setup_module("ssl_enricher", configs)
@pytest.fixture
def metadata():
    """Return a ``Metadata`` for https://example.com with two text-file media entries."""
    item = Metadata()
    item.set_url("https://example.com")
    for path in ("tests/data/testfile_1.txt", "tests/data/testfile_2.txt"):
        item.add_media(Media(path))
    return item
def test_http_raises(metadata, enricher):
    """Enriching a plain ``http://`` URL raises ``AssertionError`` mentioning the invalid scheme."""
    metadata.set_url("http://example.com")
    with pytest.raises(AssertionError, match="Invalid URL scheme"):
        enricher.enrich(metadata)
def test_empty_metadata(metadata, enricher):
    """``enrich`` returns ``None`` when the metadata has no media."""
    metadata.media = []
    outcome = enricher.enrich(metadata)
    assert outcome is None
def test_ssl_enrich(metadata, enricher):
    """The fetched certificate is written to a ``.pem`` file and attached as media."""
    with (
        patch("ssl.get_server_certificate", return_value="TEST_CERT") as get_cert,
        patch("builtins.open", mock_open()) as mock_file,
    ):
        enricher.enrich(metadata)

    get_cert.assert_called_once_with(("example.com", 443))
    mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w")
    mock_file.return_value.write.assert_called_once_with("TEST_CERT")

    # Ensure the certificate is added to metadata
    pem_attached = any(
        media.filename.endswith("example-com.pem") for media in metadata.media
    )
    assert pem_attached
def test_ssl_error_handling(enricher, metadata):
    """An ``ssl.SSLError`` raised while fetching the certificate propagates out of ``enrich``."""
    failure = ssl.SSLError("SSL error")
    with (
        patch("ssl.get_server_certificate", side_effect=failure),
        pytest.raises(ssl.SSLError, match="SSL error"),
    ):
        enricher.enrich(metadata)

Wyświetl plik

@ -0,0 +1,93 @@
import shutil
import sys
import pytest
from unittest.mock import MagicMock, patch
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.modules.whisper_enricher import WhisperEnricher
@pytest.fixture
def enricher():
    """Yield a configured ``WhisperEnricher`` together with its mocked S3 storage.

    The regular setup step is bypassed: the storage list and a
    ``MagicMock(spec=S3Storage)`` are assigned to the instance directly.
    """
    config = {
        "api_endpoint": "http://testapi",
        "api_key": "whisper-key",
        "include_srt": False,
        "timeout": 5,
        "action": "translate",
        "steps": {"storages": ["s3_storage"]}
    }

    s3_double = MagicMock(spec=S3Storage)
    s3_double.get_cdn_url.return_value = "http://s3.example.com/media.mp3"

    whisper = WhisperEnricher()
    whisper.name = "whisper_enricher"
    whisper.display_name = "Whisper Enricher"
    whisper.config_setup({whisper.name: config})

    # bypassing the setup method and mocking S3 setup
    whisper.stores = config["steps"]["storages"]
    whisper.s3 = s3_double

    yield whisper, s3_double
@pytest.fixture
def metadata():
    """Return a ``Metadata`` with a test URL and title set."""
    item = Metadata()
    item.set_url("http://test.url")
    item.set_title("test title")
    return item
@pytest.fixture
def mock_requests():
    """Patch ``requests`` in the whisper enricher module; ``post`` answers 201 with a job id."""
    target = "auto_archiver.modules.whisper_enricher.whisper_enricher.requests"
    with patch(target) as requests_double:
        submit_response = MagicMock(status_code=201)
        submit_response.json.return_value = {"id": "job123"}
        requests_double.post.return_value = submit_response
        yield requests_double
def test_successful_job_submission(enricher, metadata, mock_requests):
    """A media item with a CDN URL is submitted, polled and its transcript stored."""
    whisper, mock_s3 = enricher

    # Configure mock S3 URL to match test expectation
    cdn_url = "http://cdn.example.com/test.mp4"
    mock_s3.get_cdn_url.return_value = cdn_url

    # Create test media with matching CDN URL
    video = Media("test.mp4")
    video.mimetype = "video/mp4"
    video.add_url(cdn_url)
    metadata.media = [video]

    # Mock the complete API interaction chain
    status_response = MagicMock(status_code=200)
    status_response.json.return_value = {
        "status": "success",
        "meta": {}
    }

    artifacts_response = MagicMock(status_code=200)
    artifacts_response.json.return_value = [{
        "data": [{"start": 0, "end": 5, "text": "test transcript"}]
    }]

    # First GET is the status check, second GET fetches the artifacts.
    mock_requests.get.side_effect = [status_response, artifacts_response]

    # Run enrichment (without opening file)
    whisper.enrich(metadata)

    # Check API interactions
    mock_requests.post.assert_called_once_with(
        "http://testapi/jobs",
        json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
        headers={"Authorization": "Bearer whisper-key"}
    )

    # Verify job status checks
    assert mock_requests.get.call_count == 2
    whisper_model = metadata.media[0].get("whisper_model")
    assert "artifact_0_text" in whisper_model
    assert "test transcript" in metadata.metadata.get("content")