From d13a5ef00392452658e8799686ea902d18b683ff Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 7 Jun 2025 19:58:18 +0100 Subject: [PATCH] adds tests in minor improvements --- .../antibot_extractor_enricher.py | 1 - .../antibot_extractor_enricher/dropin.py | 5 +- .../antibot_extractor_enricher/dropins/vk.py | 16 ++-- tests/extractors/test_antibot_dropin_vk.py | 81 +++++++++++++++++++ .../test_antibot_extractor_enricher.py | 26 +++++- 5 files changed, 111 insertions(+), 18 deletions(-) create mode 100644 tests/extractors/test_antibot_dropin_vk.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 0401468..c8dc137 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -93,7 +93,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None url = to_enrich.get_url() - # TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher url_sample = url[:75] try: diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 39e34a9..805edfd 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -24,7 +24,6 @@ class Dropin: def suitable(url: str) -> bool: """ Check if the URL is suitable for processing with this dropin. - :param url: The URL to check. :return: True if the URL is suitable for processing, False otherwise. """ @@ -33,7 +32,7 @@ class Dropin: @staticmethod def sanitize_url(url: str) -> str: """ - Used to clean unnecessary URL parameters OR unfurl redirect links + Used to clean URLs before processing them. """ return url @@ -48,8 +47,6 @@ class Dropin: def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: """ Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object. - - :return: A tuple (number of Images added, number of Videos added). """ raise NotImplementedError("This method should be implemented in the subclass") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 9f33239..b36b517 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -21,19 +21,14 @@ class VkDropin(Dropin): @staticmethod def suitable(url: str) -> bool: - """ - Only suitable for VK URLs that match the wall, photo, or video patterns. - Otherwise, for example, for pages a large amount of media may be downloaded. 
- """ return "vk.com" in url @staticmethod def sanitize_url(url: str) -> str: - # TODO: test method """ Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890' """ - for pattern in [VkDropin.WALL_PATTERN, VkDropin.PHOTO_PATTERN, VkDropin.VIDEO_PATTERN]: + for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]: match = pattern.search(url) if match: return f"https://vk.com/{match.group(1)}" @@ -49,6 +44,7 @@ class VkDropin(Dropin): return True def _login(self) -> bool: + # TODO: test method self.sb.activate_cdp_mode("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): @@ -91,8 +87,10 @@ class VkDropin(Dropin): :return: A tuple (number of Images added, number of Videos added). """ - max_videos = self.extractor.max_download_videos - video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')][:max_videos] + video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')] + if type(self.extractor.max_download_videos) is int: + video_urls = video_urls[: self.extractor.max_download_videos] + if not video_urls: return 0, 0 @@ -100,7 +98,7 @@ class VkDropin(Dropin): ydl_options = [ "-o", os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), - # "--quiet", + "--quiet", "--no-playlist", "--no-write-subs", "--no-write-auto-subs", diff --git a/tests/extractors/test_antibot_dropin_vk.py b/tests/extractors/test_antibot_dropin_vk.py new file mode 100644 index 0000000..c39e2a3 --- /dev/null +++ b/tests/extractors/test_antibot_dropin_vk.py @@ -0,0 +1,81 @@ +import pytest +from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin + + +@pytest.mark.parametrize( + "input_url,expected", + [ + # Wall post modal URL + ( + "https://vk.com/somepage?w=wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + # Wall post modal URL with no dash + ( + "https://vk.com/somepage?w=wall123456_7890", + "https://vk.com/wall123456_7890", + ), + # Photo modal URL + ( + "https://vk.com/somepage?w=photo-654321_9876", + "https://vk.com/photo-654321_9876", + ), + # Photo modal URL with no dash + ( + "https://vk.com/somepage?w=photo654321_9876", + "https://vk.com/photo654321_9876", + ), + # Video modal URL + ( + "https://vk.com/somepage?w=video-111222_3334", + "https://vk.com/video-111222_3334", + ), + # Video modal URL with extra part + ( + "https://vk.com/somepage?w=video-111222_3334_ABC", + "https://vk.com/video-111222_3334_ABC", + ), + # Video modal URL with no dash + ( + "https://vk.com/somepage?w=video111222_3334", + "https://vk.com/video111222_3334", + ), + # No modal, should return unchanged + ( + "https://vk.com/wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + ( + "https://vk.com/photo-654321_9876", + "https://vk.com/photo-654321_9876", + ), + ( + "https://vk.com/video-111222_3334", + "https://vk.com/video-111222_3334", + ), + # Unrelated URL, should return unchanged + ( + "https://vk.com/id123456", + "https://vk.com/id123456", + ), + ( + "https://example.com/", + "https://example.com/", + ), + # Modal with multiple params, should still work with right priority + ( + "https://vk.com/somepage?z=photo-654321_9876&w=wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + ( + "https://vk.com/somepage?z=photo-654321_9876&w=video-111222_3334", + "https://vk.com/video-111222_3334", + ), + ( + "https://vk.com/somepage?z=video-111222_3334&w=wall-654321_9876", + "https://vk.com/wall-654321_9876", 
+ ), + ], +) +def test_sanitize_url(input_url, expected): + assert VkDropin.sanitize_url(input_url) == expected diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 3eee3bd..1da025d 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -40,35 +40,46 @@ class TestAntibotExtractorEnricher(TestExtractorBase): @pytest.mark.download @pytest.mark.parametrize( - "url,in_title,image_count,video_count", + "url,in_title,in_text,image_count,video_count", [ ( "https://en.wikipedia.org/wiki/Western_barn_owl", "western barn owl", + "Tyto alba", 5, 0, ), ( "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/", "open sources show myanmar", + "Bellingcat has geolocated", 5, 0, ), ( "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/", "shot from above", + "continued the work of Gazan journalists", 5, 1, ), ( "https://www.bellingcat.com/about/general-information", "general information", + "Stichting Bellingcat", 0, # SVGs are ignored 0, ), + ( + "https://vk.com/wikipedia?from=search&w=wall-36156673_20451", + "Hounds of Love", + "16 сентября 1985 года лейблом EMI Records.", + 5, + 0, + ), ], ) - def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count): + def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count): """ Test downloading pages with media. """ @@ -81,7 +92,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "max_download_videos": "inf", }, ) - + url = self.extractor.sanitize_url(url) item = make_item(url) result = self.extractor.download(item) @@ -89,7 +100,14 @@ class TestAntibotExtractorEnricher(TestExtractorBase): # Check title contains all required words (case-insensitive) page_title = result.get_title() or "" - assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'" + assert in_title.lower() in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'" + + # Check text contains all required words (case-insensitive) + with open(result.get_media_by_id("html_source_code").filename, "r", encoding="utf-8") as f: + html_content = f.read() + assert in_text.lower() in html_content.lower(), ( + f"Expected HTML to contain '{in_text}', got '{html_content}'" + ) image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"] assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"