diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 969c70f..6071fb1 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -3,6 +3,7 @@ import math import mimetypes import os import sys +import traceback from urllib.parse import urljoin from loguru import logger @@ -75,7 +76,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): return to_enrich except Exception as e: - logger.error(f"ANTIBOT runtime error: {e}") + logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}") return False def _hit_auth_wall(self, sb: SB) -> bool: diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py new file mode 100644 index 0000000..0dd166c --- /dev/null +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -0,0 +1,173 @@ +import pytest + +from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher +from .test_extractor_base import TestExtractorBase + + +class DummySB: + def __init__(self, url="", title="", visible_texts=None, visible_elements=None): + self._url = url + self._title = title + self._visible_texts = visible_texts or set() + self._visible_elements = visible_elements or set() + + def get_current_url(self): + return self._url + + def get_title(self): + return self._title + + def is_text_visible(self, text): + return text in self._visible_texts + + def is_element_visible(self, selector): + return selector in self._visible_elements + + +class TestAntibotExtractorEnricher(TestExtractorBase): + """Tests Antibot Extractor/Enricher""" + + extractor_module = "antibot_extractor_enricher" + extractor: AntibotExtractorEnricher + + config = { + "save_to_pdf": False, + "max_download_images": 0, + "max_download_videos": 0, + "exclude_media_extensions": ".svg,.ico,.gif", + "proxy": None, + } + + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_title,image_count,video_count", + [ + ( + "https://en.wikipedia.org/wiki/Western_barn_owl", + "western barn owl", + 5, + 0, + ), + ( + "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/", + "open sources show myanmar", + 5, + 0, + ), + ( + "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/", + "shot from above", + 5, + 1, + ), + ( + "https://www.bellingcat.com/about/general-information", + "general information", + 0, # SVGs are ignored + 0, + ), + ], + ) + def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count): + """ + Test downloading pages with media. + """ + + self.extractor = setup_module( + self.extractor_module, + { + "save_to_pdf": True, + "max_download_images": 5, + "max_download_videos": "inf", + }, + ) + + item = make_item(url) + result = self.extractor.download(item) + + assert result.status == "antibot", "Expected status to be 'antibot'" + + # Check title contains all required words (case-insensitive) + page_title = result.get_title() or "" + assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'" + + image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"] + assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}" + video_media = [m for m in result.media if m.is_video()] + assert len(video_media) == video_count, f"Expected {video_count} video items, got {len(video_media)}" + + for expected_id in ["screenshot", "pdf", "html_source_code"]: + assert any(m.get("id") == expected_id for m in result.media), ( + f"Expected media with id '{expected_id}' not found" + ) + + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_html", + [ + ( + "https://myrotvorets.center/about/", + "Центр «Миротворець»", + ), + ( + "https://seleniumbase.io/apps/turnstile", + 'id="captcha-success"', + ), + ], + ) + def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html): + """ + Test downloading a page with Cloudflare Turnstile captcha. + """ + + item = make_item(url) + self.extractor.enrich(item) + + assert item.status != "antibot", "Expected status not to be 'antibot' after handling Cloudflare Turnstile" + + html_media = item.get_media_by_id("html_source_code") + with open(html_media.filename, "r", encoding="utf-8") as f: + html_content = f.read() + assert in_html.lower() in html_content.lower(), f"Expected HTML to contain '{in_html}'" + + @pytest.mark.parametrize( + "url,title,visible_texts,visible_elements,expected", + [ + # URL triggers + ("https://example.com/login", "Welcome", set(), set(), True), + ("https://example.com/somepage", "Just a moment...", set(), set(), True), + ("https://example.com/", "Welcome", {"Please log in"}, set(), True), + ("https://example.com/", "Welcome", set(), {"input[type='password']"}, True), + ("https://example.com/", "Welcome", set("No issue here"), set(), False), + # Title triggers + ("https://example.com/", "Log in", set(), set(), True), + ("https://example.com/", "Verification required", set(), set(), True), + # Text triggers (case-insensitive) + ("https://example.com/", "Welcome", {"Sign up or log in"}, set(), True), + ("https://example.com/", "Welcome", {"sign up or log in"}, set(), True), + # Element triggers + ("https://example.com/", "Welcome", set(), {"input[name='email']"}, True), + # No triggers + ("https://example.com/", "Welcome", set(), set(), False), + ], + ) + def test_hit_auth_wall(self, url, title, visible_texts, visible_elements, expected): + extractor = AntibotExtractorEnricher() + sb = DummySB(url=url, title=title, visible_texts=visible_texts, visible_elements=visible_elements) + assert extractor._hit_auth_wall(sb) == expected + + def test_enrich_handles_sb_exception(self, make_item, mocker): + """ + Test that enrich returns False and logs error if SB raises an exception. + """ + + # Patch SB to raise an exception on context enter + mock_sb = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.SB") + mock_logger = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.logger") + mock_sb.side_effect = Exception("SB failed") + + item = make_item("https://example.com/") + result = self.extractor.enrich(item) + + assert result is False + mock_logger.error.assert_called()