# auto-archiver/tests/extractors/test_antibot_extractor_enri... (path truncated by the file viewer)
#
# 265 lines
# 10 KiB
# Python

import os
import pytest
from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher
from .test_extractor_base import TestExtractorBase
class DummySB:
    """Minimal in-memory stand-in for the browser object used by the extractor.

    It only answers the four queries this test module relies on, from values
    fixed at construction time, so no real browser is needed.

    Args:
        url: Value returned by ``get_current_url``.
        title: Value returned by ``get_title``.
        visible_texts: Collection of texts reported as visible; a falsy value
            (``None`` or empty) is replaced by an empty set.
        visible_elements: Collection of selectors reported as visible; a falsy
            value is replaced by an empty set.
    """

    def __init__(self, url: str = "", title: str = "", visible_texts=None, visible_elements=None):
        self._url = url
        self._title = title
        # `or set()` avoids a shared mutable default and tolerates None.
        self._visible_texts = visible_texts or set()
        self._visible_elements = visible_elements or set()

    def get_current_url(self) -> str:
        """Return the URL given at construction."""
        return self._url

    def get_title(self) -> str:
        """Return the page title given at construction."""
        return self._title

    def is_text_visible(self, text) -> bool:
        """Return True if ``text`` is a member of the configured visible texts.

        Membership is exact (case-sensitive) when a set is supplied.
        """
        return text in self._visible_texts

    def is_element_visible(self, selector) -> bool:
        """Return True if ``selector`` is a member of the configured visible elements."""
        return selector in self._visible_elements
class TestAntibotExtractorEnricher(TestExtractorBase):
    """Tests for the antibot extractor/enricher module.

    Tests marked ``download`` hit live websites. The Reddit and LinkedIn tests
    are skipped unless the matching ``*_TEST_USERNAME`` / ``*_TEST_PASSWORD``
    environment variables are set.
    """

    extractor_module = "antibot_extractor_enricher"
    extractor: AntibotExtractorEnricher
    # Baseline configuration; individual tests override entries as needed.
    config = {
        "save_to_pdf": False,
        "max_download_images": 0,
        "max_download_videos": 0,
        "user_data_dir": "./tests/tmp/user_data",
        "proxy": None,
        # Credentials come from the environment so none are committed.
        "authentication": {
            "reddit.com": {
                "username": os.environ.get("REDDIT_TEST_USERNAME"),
                "password": os.environ.get("REDDIT_TEST_PASSWORD"),
            },
            "linkedin.com": {
                "username": os.environ.get("LINKEDIN_TEST_USERNAME"),
                "password": os.environ.get("LINKEDIN_TEST_PASSWORD"),
            },
        },
    }

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_title,in_text,image_count,video_count",
        [
            (
                "https://en.wikipedia.org/wiki/Western_barn_owl",
                "western barn owl",
                "Tyto alba",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
                "open sources show myanmar",
                "Bellingcat has geolocated",
                5,
                0,
            ),
            (
                "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
                "shot from above",
                "continued the work of Gazan journalists",
                5,
                1,
            ),
            (
                "https://www.bellingcat.com/about/general-information",
                "general information",
                "Stichting Bellingcat",
                0,  # SVGs are ignored
                0,
            ),
            (
                "https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
                "Hounds of Love",
                "16 сентября 1985 года лейблом EMI Records.",
                5,
                0,
            ),
        ],
    )
    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
        """
        Test downloading pages with media.

        Downloads ``url`` with PDF saving enabled and up to 5 images, then checks
        the result status, the title, the saved HTML, the number of image and
        video media, and that screenshot/pdf/html_source_code media are present.
        """
        overrides = {
            "save_to_pdf": True,
            "max_download_images": 5,
            "max_download_videos": "inf",
        }
        self.extractor = setup_module(self.extractor_module, self.config | overrides)

        url = self.extractor.sanitize_url(url)
        item = make_item(url)
        result = self.extractor.download(item)

        assert result.status == "antibot", "Expected status to be 'antibot'"

        # Title must contain the expected phrase as a substring (case-insensitive)
        page_title = result.get_title() or ""
        assert in_title.lower() in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"

        # Saved HTML must contain the expected phrase as a substring (case-insensitive)
        with open(result.get_media_by_id("html_source_code").filename, "r", encoding="utf-8") as f:
            html_content = f.read()
        assert in_text.lower() in html_content.lower(), (
            f"Expected HTML to contain '{in_text}', got '{html_content}'"
        )

        # The screenshot is itself an image, so it is excluded from the image count
        image_media = [m for m in result.media if m.is_image() and m.get("id") != "screenshot"]
        assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"

        video_media = [m for m in result.media if m.is_video()]
        assert len(video_media) == video_count, f"Expected {video_count} video items, got {len(video_media)}"

        for expected_id in ["screenshot", "pdf", "html_source_code"]:
            assert any(m.get("id") == expected_id for m in result.media), (
                f"Expected media with id '{expected_id}' not found"
            )

    @pytest.mark.skipif(
        not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"),
        reason="No Reddit test credentials provided",
    )
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_title,in_text,image_count,video_count",
        [
            (
                "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/",
                "Duy tran is the owner and prime wood work artist",
                " Created Jan 26, 2015",
                4,
                0,
            ),
        ],
    )
    def test_reddit_download_with_login(
        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
    ):
        """Run the generic media-download checks against a Reddit post (needs Reddit credentials)."""
        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)

    @pytest.mark.skipif(
        not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"),
        reason="No LinkedIn test credentials provided",
    )
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_title,in_text,image_count,video_count",
        [
            (
                "https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/",
                "Post",
                "It takes time to go from hunch to reporting...",
                2,
                0,
            ),
        ],
    )
    def test_linkedin_download_with_login(
        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
    ):
        """Run the generic media-download checks against a LinkedIn post (needs LinkedIn credentials)."""
        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_html",
        [
            (
                "https://myrotvorets.center/about/",
                "Центр «Миротворець»",
            ),
            (
                "https://seleniumbase.io/apps/turnstile",
                '<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" style="" width="180">',
            ),
            (
                "https://seleniumbase.io/apps/form_turnstile",
                '<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" width="120" style="">',
            ),
            (
                "https://gitlab.com/users/sign_in",
                "Password",
            ),
        ],
    )
    def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
        """
        Test downloading a page with Cloudflare Turnstile captcha.

        Enriches ``url`` with auth-wall detection disabled and checks that the
        saved HTML contains ``in_html`` (case-insensitive substring).
        """
        self.extractor = setup_module(
            self.extractor_module,
            {
                "save_to_pdf": True,
                "detect_auth_wall": False,
                "max_download_images": 5,
                "max_download_videos": "inf",
            },
        )

        item = make_item(url)
        self.extractor.enrich(item)
        assert item.status != "antibot", "Expected status not to be 'antibot' after handling Cloudflare Turnstile"

        html_media = item.get_media_by_id("html_source_code")
        with open(html_media.filename, "r", encoding="utf-8") as f:
            html_content = f.read()
        assert in_html.lower() in html_content.lower(), f"Expected HTML to contain '{in_html}'"

    @pytest.mark.parametrize(
        "url,title,visible_texts,visible_elements,expected",
        [
            # Cases intended to exercise each signal in turn: URL, title, visible text, visible element
            ("https://example.com/login", "Welcome", set(), set(), True),
            ("https://example.com/somepage", "Just a moment...", set(), set(), True),
            ("https://example.com/", "Welcome", {"Please log in"}, set(), True),
            ("https://example.com/", "Welcome", set(), {"input[type='password']"}, True),
            # Harmless visible text must not trigger. A set literal is needed here:
            # set("No issue here") would build a set of single characters instead.
            ("https://example.com/", "Welcome", {"No issue here"}, set(), False),
            # Title triggers
            ("https://example.com/", "Log in", set(), set(), True),
            ("https://example.com/", "Verification required", set(), set(), True),
            # Text triggers (case-insensitive)
            ("https://example.com/", "Welcome", {"Sign up or log in"}, set(), True),
            ("https://example.com/", "Welcome", {"sign up or log in"}, set(), True),
            # Element triggers
            ("https://example.com/", "Welcome", set(), {"input[name='email']"}, True),
            # No triggers
            ("https://example.com/", "Welcome", set(), set(), False),
        ],
    )
    def test_hit_auth_wall(self, url, title, visible_texts, visible_elements, expected):
        """Check ``_hit_auth_wall`` against a stubbed browser for each parametrized page state."""
        extractor = AntibotExtractorEnricher()
        sb = DummySB(url=url, title=title, visible_texts=visible_texts, visible_elements=visible_elements)
        assert extractor._hit_auth_wall(sb) == expected

    def test_enrich_handles_sb_exception(self, make_item, mocker):
        """
        Test that enrich returns False and logs error if SB raises an exception.
        """
        # Patch SB so that calling SB(...) raises, and patch the module logger to observe the error call.
        # NOTE(review): relies on self.extractor being set up outside this test
        # (presumably by a fixture in TestExtractorBase) - confirm.
        mock_sb = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.SB")
        mock_logger = mocker.patch("auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher.logger")
        mock_sb.side_effect = Exception("SB failed")

        item = make_item("https://example.com/")
        result = self.extractor.enrich(item)

        assert result is False
        mock_logger.error.assert_called()