From cbe98c729d9a4928889b5ece924eb62db1f41017 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 12 Feb 2025 19:32:40 +0000 Subject: [PATCH] Enricher tests --- .../screenshot_enricher.py | 8 +- tests/conftest.py | 20 +- tests/enrichers/test_metadata_enricher.py | 76 +++++++ tests/enrichers/test_pdq_hash_enricher.py | 84 +++++++ tests/enrichers/test_screenshot_enricher.py | 205 ++++++++++++++++++ tests/enrichers/test_ssl_enricher.py | 54 +++++ tests/enrichers/test_whisper_enricher.py | 93 ++++++++ 7 files changed, 538 insertions(+), 2 deletions(-) create mode 100644 tests/enrichers/test_metadata_enricher.py create mode 100644 tests/enrichers/test_pdq_hash_enricher.py create mode 100644 tests/enrichers/test_screenshot_enricher.py create mode 100644 tests/enrichers/test_ssl_enricher.py create mode 100644 tests/enrichers/test_whisper_enricher.py diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index e1da99d..832d0f8 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): + def __init__(self, webdriver_factory=None): + super().__init__() + self.webdriver_factory = webdriver_factory or Webdriver + def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() @@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher): logger.debug(f"Enriching screenshot for {url=}") auth = self.auth_for_site(url) - with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, + with self.webdriver_factory( + self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver: try: driver.get(url) @@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher): logger.info("TimeoutException loading page for screenshot") except Exception as e: logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") + diff --git a/tests/conftest.py b/tests/conftest.py index 8675fbc..d7f484f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,8 @@ import pickle from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib +from unittest.mock import patch + import pytest from auto_archiver.core.metadata import Metadata from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES @@ -128,4 +130,20 @@ def unpickle(): test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") with open(os.path.join(test_data_dir, path), "rb") as f: return pickle.load(f) - return _unpickle \ No newline at end of file + return _unpickle + + +@pytest.fixture +def mock_python_dependencies(): + with patch("auto_archiver.core.module") as mock_check_python_dep: + # Mock all Python dependencies as available + mock_check_python_dep.return_value = True + yield mock_check_python_dep + + +@pytest.fixture +def mock_binary_dependencies(): + with patch("shutil.which") as mock_shutil_which: + # Mock all binary dependencies as available + mock_shutil_which.return_value = "/usr/bin/fake_binary" + yield mock_shutil_which diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py new file mode 100644 index 0000000..314fca7 --- /dev/null +++ b/tests/enrichers/test_metadata_enricher.py @@ -0,0 +1,76 @@ +from unittest.mock import MagicMock, patch, Mock + +import pytest + +from auto_archiver.core import Metadata, Media + + +@pytest.fixture +def mock_media(): + """Creates a mock Media object.""" + mock: Media = MagicMock(spec=Media) + mock.filename = "mock_file.txt" + return mock + + +@pytest.fixture +def enricher(setup_module): + return setup_module("metadata_enricher", {}) + + +@pytest.mark.parametrize( + "output,expected", + [ + ("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}), + ("InvalidLine", {}), + ("", {}), + ], +) +@patch("subprocess.run") +def test_get_metadata(mock_run, enricher, output, expected): + mock_run.return_value.stdout = output + mock_run.return_value.stderr = "" + mock_run.return_value.returncode = 0 + + result = enricher.get_metadata("test.jpg") + assert result == expected + mock_run.assert_called_once_with( + ["exiftool", "test.jpg"], capture_output=True, text=True + ) + + +@patch("subprocess.run") +def test_get_metadata_exiftool_not_found(mock_run, enricher): + mock_run.side_effect = FileNotFoundError + result = enricher.get_metadata("test.jpg") + assert result == {} + + +def test_enrich_sets_metadata(enricher): + media1 = Mock(filename="img1.jpg") + media2 = Mock(filename="img2.jpg") + metadata = Mock() + metadata.media = [media1, media2] + enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} + + enricher.enrich(metadata) + + media1.set.assert_called_once_with("metadata", {"key": "value"}) + media2.set.assert_not_called() + assert metadata.media == [media1, media2] + + +def test_enrich_empty_media(enricher): + metadata = Mock() + metadata.media = [] + # Should not raise errors + enricher.enrich(metadata) + + +@patch("loguru.logger.error") +@patch("subprocess.run") +def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher): + mock_run.side_effect = Exception("Test error") + result = enricher.get_metadata("test.jpg") + assert result == {} + mock_logger_error.assert_called_once() diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py new file mode 100644 index 0000000..e90cd22 --- /dev/null +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -0,0 +1,84 @@ +from unittest.mock import patch + +import pytest +from PIL import UnidentifiedImageError + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher + + +@pytest.fixture +def enricher(setup_module): + return setup_module("pdq_hash_enricher", {}) + + +@pytest.fixture +def metadata_with_images(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media(filename="image1.jpg", key="image1")) + m.add_media(Media(filename="image2.jpg", key="image2")) + return m + + +def test_successful_enrich(metadata_with_images): + with ( + patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), + patch("PIL.Image.open"), + patch.object(Media, "is_image", return_value=True) as mock_is_image, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + + # Ensure the hash is set for image media + for media in metadata_with_images.media: + assert media.get("pdq_hash") is not None + + +def test_enrich_skip_non_image(metadata_with_images): + with ( + patch.object(Media, "is_image", return_value=False), + patch("pdqhash.compute") as mock_pdq, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + mock_pdq.assert_not_called() + + +def test_enrich_handles_corrupted_image(metadata_with_images): + with ( + patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")), + patch("pdqhash.compute") as mock_pdq, + patch("loguru.logger.error") as mock_logger, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + + assert mock_logger.call_count == len(metadata_with_images.media) + mock_pdq.assert_not_called() + + +@pytest.mark.parametrize( + "media_id, should_have_hash", + [ + ("screenshot", False), + ("warc-file-123", False), + ("regular-image", True), + ] +) +def test_enrich_excludes_by_filetype(media_id, should_have_hash): + metadata = Metadata() + metadata.set_url("https://example.com") + metadata.add_media(Media(filename="image.jpg").set("id", media_id)) + + with ( + patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), + patch("PIL.Image.open"), + patch.object(Media, "is_image", return_value=True), + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata) + + media_item = metadata.media[0] + assert (media_item.get("pdq_hash") is not None) == should_have_hash + diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py new file mode 100644 index 0000000..3998deb --- /dev/null +++ b/tests/enrichers/test_screenshot_enricher.py @@ -0,0 +1,205 @@ +import base64 +from unittest.mock import patch, MagicMock + +import pytest +from selenium.common.exceptions import TimeoutException + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher + + +@pytest.fixture +def mock_selenium_env(): + # Patches Selenium calls and driver checks in one place. + with ( + patch("shutil.which") as mock_which, + patch("auto_archiver.utils.webdriver.CookieSettingDriver") as mock_driver_class, + patch( + "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths" + ) as mock_binary_paths, + patch("pathlib.Path.is_file", return_value=True), + patch("subprocess.Popen") as mock_popen, + patch( + "selenium.webdriver.common.service.Service.is_connectable", + return_value=True, + ), + patch("selenium.webdriver.FirefoxOptions") as mock_firefox_options, + ): + # Mock driver existence + def mock_which_side_effect(dep): + return "/mock/geckodriver" if dep == "geckodriver" else None + + mock_which.side_effect = mock_which_side_effect + # Mock binary paths + mock_binary_paths.return_value = { + "driver_path": "/mock/driver", + "browser_path": "/mock/browser", + } + # Popen + mock_proc = MagicMock() + mock_proc.poll.return_value = None + mock_popen.return_value = mock_proc + # CookieSettingDriver -> returns a mock driver + mock_driver = MagicMock() + mock_driver_class.return_value = mock_driver + # FirefoxOptions + mock_options_instance = MagicMock() + mock_firefox_options.return_value = mock_options_instance + yield mock_driver, mock_driver_class, mock_options_instance + + +@pytest.fixture +def common_patches(tmp_path): + with ( + patch("auto_archiver.utils.url.is_auth_wall", return_value=False), + patch("os.path.join", return_value=str(tmp_path / "test.png")), + patch("time.sleep"), + ): + yield + + +@pytest.fixture +def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher: + configs: dict = { + "width": 1280, + "height": 720, + "timeout": 60, + "sleep_before_screenshot": 4, + "http_proxy": "", + "save_to_pdf": "False", + "print_options": {}, + } + return setup_module("screenshot_enricher", configs) + + +@pytest.fixture +def metadata_with_video(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media(filename="video.mp4").set("id", "video1")) + return m + + +def test_enrich_adds_screenshot( + screenshot_enricher, + metadata_with_video, + mock_selenium_env, + common_patches, + tmp_path, +): + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + screenshot_enricher.enrich(metadata_with_video) + mock_driver_class.assert_called_once_with( + cookies=None, + cookiejar=None, + facebook_accept_cookies=False, + options=mock_options_instance, + ) + # Verify the actual calls on the returned mock_driver + mock_driver.get.assert_called_once_with("https://example.com") + mock_driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png")) + # Check that the media was added (2 = original video + screenshot) + assert len(metadata_with_video.media) == 2 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + + +@pytest.mark.parametrize( + "url,is_auth", + [ + ("https://example.com", False), + ("https://private.com", True), + ], +) +def test_enrich_auth_wall( + screenshot_enricher, + metadata_with_video, + mock_selenium_env, + common_patches, + url, + is_auth, +): + # Testing with and without is_auth_wall + mock_driver, mock_driver_class, _ = mock_selenium_env + with patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth): + metadata_with_video.set_url(url) + screenshot_enricher.enrich(metadata_with_video) + + if is_auth: + mock_driver.get.assert_not_called() + assert len(metadata_with_video.media) == 1 + assert metadata_with_video.media[0].properties.get("id") == "video1" + else: + mock_driver.get.assert_called_once_with(url) + assert len(metadata_with_video.media) == 2 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + + +def test_handle_timeout_exception( + screenshot_enricher, metadata_with_video, mock_selenium_env +): + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + + mock_driver.get.side_effect = TimeoutException + with patch("loguru.logger.info") as mock_log: + screenshot_enricher.enrich(metadata_with_video) + mock_log.assert_called_once_with("TimeoutException loading page for screenshot") + assert len(metadata_with_video.media) == 1 + + +def test_handle_general_exception( + screenshot_enricher, metadata_with_video, mock_selenium_env +): + """Test proper handling of unexpected general exceptions""" + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + # Simulate a generic exception when save_screenshot is called + mock_driver.get.return_value = None + mock_driver.save_screenshot.side_effect = Exception("Unexpected Error") + + with patch("loguru.logger.error") as mock_log: + screenshot_enricher.enrich(metadata_with_video) + # Verify that the exception was logged with the log + mock_log.assert_called_once_with( + "Got error while loading webdriver for screenshot enricher: Unexpected Error" + ) + # And no new media was added due to the error + assert len(metadata_with_video.media) == 1 + + +def test_pdf_creation(screenshot_enricher, metadata_with_video, mock_selenium_env): + """Test PDF creation when save_to_pdf is enabled""" + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + + # Override the save_to_pdf option + screenshot_enricher.save_to_pdf = True + # Mock the print_page method to return base64-encoded content + mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode( + "utf-8" + ) + with ( + patch("os.path.join", side_effect=lambda *args: f"{args[-1]}"), + patch( + "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", + return_value="fixed123", + ), + patch("builtins.open", new_callable=MagicMock()) as mock_open, + patch("loguru.logger.error") as mock_log, + ): + screenshot_enricher.enrich(metadata_with_video) + + # Verify screenshot and PDF creation + mock_driver.save_screenshot.assert_called_once() + mock_driver.print_page.assert_called_once_with(mock_driver.print_options) + + # Check that PDF file was opened and written + mock_open.assert_any_call("pdf_fixed123.pdf", "wb") + # Ensure both screenshot and PDF were added as media + assert len(metadata_with_video.media) == 3 # Original video + screenshot + PDF + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + assert metadata_with_video.media[2].properties.get("id") == "pdf" + + +@pytest.fixture(autouse=True) +def cleanup_files(tmp_path): + yield + for file in tmp_path.iterdir(): + file.unlink() diff --git a/tests/enrichers/test_ssl_enricher.py b/tests/enrichers/test_ssl_enricher.py new file mode 100644 index 0000000..c4d2dc5 --- /dev/null +++ b/tests/enrichers/test_ssl_enricher.py @@ -0,0 +1,54 @@ +import ssl +from unittest.mock import patch, mock_open + +import pytest + +from auto_archiver.core import Metadata, Media + + +@pytest.fixture +def enricher(setup_module): + configs: dict = { + "skip_when_nothing_archived": "True", + } + return setup_module("ssl_enricher", configs) + + +@pytest.fixture +def metadata(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media("tests/data/testfile_1.txt")) + m.add_media(Media("tests/data/testfile_2.txt")) + return m + + +def test_http_raises(metadata, enricher): + metadata.set_url("http://example.com") + with pytest.raises(AssertionError) as exc_info: + enricher.enrich(metadata) + assert "Invalid URL scheme" in str(exc_info.value) + + +def test_empty_metadata(metadata, enricher): + metadata.media = [] + assert enricher.enrich(metadata) is None + + +def test_ssl_enrich(metadata, enricher): + with patch("ssl.get_server_certificate", return_value="TEST_CERT"), \ + patch("builtins.open", mock_open()) as mock_file: + enricher.enrich(metadata) + + ssl.get_server_certificate.assert_called_once_with(("example.com", 443)) + mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w") + mock_file().write.assert_called_once_with("TEST_CERT") + # Ensure the certificate is added to metadata + assert any(media.filename.endswith("example-com.pem") for media in metadata.media) + + +def test_ssl_error_handling(enricher, metadata): + with patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error")): + with pytest.raises(ssl.SSLError, match="SSL error"): + enricher.enrich(metadata) + diff --git a/tests/enrichers/test_whisper_enricher.py b/tests/enrichers/test_whisper_enricher.py new file mode 100644 index 0000000..8a73ed7 --- /dev/null +++ b/tests/enrichers/test_whisper_enricher.py @@ -0,0 +1,93 @@ +import shutil +import sys +import pytest +from unittest.mock import MagicMock, patch +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.s3_storage import S3Storage + +from auto_archiver.modules.whisper_enricher import WhisperEnricher + + +@pytest.fixture +def enricher(): + """Fixture with mocked S3 and API dependencies""" + config = { + "api_endpoint": "http://testapi", + "api_key": "whisper-key", + "include_srt": False, + "timeout": 5, + "action": "translate", + "steps": {"storages": ["s3_storage"]} + } + mock_s3 = MagicMock(spec=S3Storage) + mock_s3.get_cdn_url.return_value = "http://s3.example.com/media.mp3" + instance = WhisperEnricher() + instance.name = "whisper_enricher" + instance.display_name = "Whisper Enricher" + instance.config_setup({instance.name: config}) + # bypassing the setup method and mocking S3 setup + instance.stores = config['steps']['storages'] + instance.s3 = mock_s3 + yield instance, mock_s3 + + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.set_url("http://test.url") + metadata.set_title("test title") + return metadata + + +@pytest.fixture +def mock_requests(): + with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: + mock_response = MagicMock() + mock_response.status_code = 201 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + yield mock_requests + + +def test_successful_job_submission(enricher, metadata, mock_requests): + """Test successful media processing with S3 configured""" + whisper, mock_s3 = enricher + # Configure mock S3 URL to match test expectation + mock_s3.get_cdn_url.return_value = "http://cdn.example.com/test.mp4" + + # Create test media with matching CDN URL + m = Media("test.mp4") + m.mimetype = "video/mp4" + m.add_url(mock_s3.get_cdn_url.return_value) + metadata.media = [m] + + # Mock the complete API interaction chain + mock_status_response = MagicMock() + mock_status_response.status_code = 200 + mock_status_response.json.return_value = { + "status": "success", + "meta": {} + } + mock_artifacts_response = MagicMock() + mock_artifacts_response.status_code = 200 + mock_artifacts_response.json.return_value = [{ + "data": [{"start": 0, "end": 5, "text": "test transcript"}] + }] + # Set up mock response sequence + mock_requests.get.side_effect = [ + mock_status_response, # First call: status check + mock_artifacts_response # Second call: artifacts check + ] + # Run enrichment (without opening file) + whisper.enrich(metadata) + # Check API interactions + mock_requests.post.assert_called_once_with( + "http://testapi/jobs", + json={"url": "http://cdn.example.com/test.mp4", "type": "translate"}, + headers={"Authorization": "Bearer whisper-key"} + ) + # Verify job status checks + assert mock_requests.get.call_count == 2 + assert "artifact_0_text" in metadata.media[0].get("whisper_model") + assert "test transcript" in metadata.metadata.get("content") +