From d13a5ef00392452658e8799686ea902d18b683ff Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 7 Jun 2025 19:58:18 +0100 Subject: [PATCH] adds tests in minor improvements --- .../antibot_extractor_enricher.py | 1 - .../antibot_extractor_enricher/dropin.py | 5 +- .../antibot_extractor_enricher/dropins/vk.py | 16 ++-- tests/extractors/test_antibot_dropin_vk.py | 81 +++++++++++++++++++ .../test_antibot_extractor_enricher.py | 26 +++++- 5 files changed, 111 insertions(+), 18 deletions(-) create mode 100644 tests/extractors/test_antibot_dropin_vk.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 0401468..c8dc137 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -93,7 +93,6 @@ class AntibotExtractorEnricher(Extractor, Enricher): def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool: using_user_data_dir = self.user_data_dir if custom_data_dir else None url = to_enrich.get_url() - # TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher url_sample = url[:75] try: diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 39e34a9..805edfd 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -24,7 +24,6 @@ class Dropin: def suitable(url: str) -> bool: """ Check if the URL is suitable for processing with this dropin. - :param url: The URL to check. :return: True if the URL is suitable for processing, False otherwise. """ @@ -33,7 +32,7 @@ class Dropin: @staticmethod def sanitize_url(url: str) -> str: """ - Used to clean unnecessary URL parameters OR unfurl redirect links + Used to clean URLs before processing them. """ return url @@ -48,8 +47,6 @@ class Dropin: def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: """ Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object. - - :return: A tuple (number of Images added, number of Videos added). """ raise NotImplementedError("This method should be implemented in the subclass") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 9f33239..b36b517 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -21,19 +21,14 @@ class VkDropin(Dropin): @staticmethod def suitable(url: str) -> bool: - """ - Only suitable for VK URLs that match the wall, photo, or video patterns. - Otherwise, for example, for pages a large amount of media may be downloaded. 
- """ return "vk.com" in url @staticmethod def sanitize_url(url: str) -> str: - # TODO: test method """ Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890' """ - for pattern in [VkDropin.WALL_PATTERN, VkDropin.PHOTO_PATTERN, VkDropin.VIDEO_PATTERN]: + for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]: match = pattern.search(url) if match: return f"https://vk.com/{match.group(1)}" @@ -49,6 +44,7 @@ class VkDropin(Dropin): return True def _login(self) -> bool: + # TODO: test method self.sb.activate_cdp_mode("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): @@ -91,8 +87,10 @@ class VkDropin(Dropin): :return: A tuple (number of Images added, number of Videos added). """ - max_videos = self.extractor.max_download_videos - video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')][:max_videos] + video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')] + if type(self.extractor.max_download_videos) is int: + video_urls = video_urls[: self.extractor.max_download_videos] + if not video_urls: return 0, 0 @@ -100,7 +98,7 @@ class VkDropin(Dropin): ydl_options = [ "-o", os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"), - # "--quiet", + "--quiet", "--no-playlist", "--no-write-subs", "--no-write-auto-subs", diff --git a/tests/extractors/test_antibot_dropin_vk.py b/tests/extractors/test_antibot_dropin_vk.py new file mode 100644 index 0000000..c39e2a3 --- /dev/null +++ b/tests/extractors/test_antibot_dropin_vk.py @@ -0,0 +1,81 @@ +import pytest +from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin + + +@pytest.mark.parametrize( + "input_url,expected", + [ + # Wall post modal URL + ( + "https://vk.com/somepage?w=wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + # Wall post modal URL with no dash + ( + "https://vk.com/somepage?w=wall123456_7890", + "https://vk.com/wall123456_7890", + ), + # Photo modal URL + ( + "https://vk.com/somepage?w=photo-654321_9876", + "https://vk.com/photo-654321_9876", + ), + # Photo modal URL with no dash + ( + "https://vk.com/somepage?w=photo654321_9876", + "https://vk.com/photo654321_9876", + ), + # Video modal URL + ( + "https://vk.com/somepage?w=video-111222_3334", + "https://vk.com/video-111222_3334", + ), + # Video modal URL with extra part + ( + "https://vk.com/somepage?w=video-111222_3334_ABC", + "https://vk.com/video-111222_3334_ABC", + ), + # Video modal URL with no dash + ( + "https://vk.com/somepage?w=video111222_3334", + "https://vk.com/video111222_3334", + ), + # No modal, should return unchanged + ( + "https://vk.com/wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + ( + "https://vk.com/photo-654321_9876", + "https://vk.com/photo-654321_9876", + ), + ( + "https://vk.com/video-111222_3334", + "https://vk.com/video-111222_3334", + ), + # Unrelated URL, should return unchanged + ( + "https://vk.com/id123456", + "https://vk.com/id123456", + ), + ( + "https://example.com/", + "https://example.com/", + ), + # Modal with multiple params, should still work with right priority + ( + "https://vk.com/somepage?z=photo-654321_9876&w=wall-123456_7890", + "https://vk.com/wall-123456_7890", + ), + ( + "https://vk.com/somepage?z=photo-654321_9876&w=video-111222_3334", + "https://vk.com/video-111222_3334", + ), + ( + "https://vk.com/somepage?z=video-111222_3334&w=wall-654321_9876", + "https://vk.com/wall-654321_9876", 
+ ), + ], +) +def test_sanitize_url(input_url, expected): + assert VkDropin.sanitize_url(input_url) == expected diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 3eee3bd..1da025d 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -40,35 +40,46 @@ class TestAntibotExtractorEnricher(TestExtractorBase): @pytest.mark.download @pytest.mark.parametrize( - "url,in_title,image_count,video_count", + "url,in_title,in_text,image_count,video_count", [ ( "https://en.wikipedia.org/wiki/Western_barn_owl", "western barn owl", + "Tyto alba", 5, 0, ), ( "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/", "open sources show myanmar", + "Bellingcat has geolocated", 5, 0, ), ( "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/", "shot from above", + "continued the work of Gazan journalists", 5, 1, ), ( "https://www.bellingcat.com/about/general-information", "general information", + "Stichting Bellingcat", 0, # SVGs are ignored 0, ), + ( + "https://vk.com/wikipedia?from=search&w=wall-36156673_20451", + "Hounds of Love", + "16 сентября 1985 года лейблом EMI Records.", + 5, + 0, + ), ], ) - def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count): + def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count): """ Test downloading pages with media. """ @@ -81,7 +92,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "max_download_videos": "inf", }, ) - + url = self.extractor.sanitize_url(url) item = make_item(url) result = self.extractor.download(item) @@ -89,7 +100,14 @@ class TestAntibotExtractorEnricher(TestExtractorBase): # Check title contains all required words (case-insensitive) page_title = result.get_title() or "" - assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'" + assert in_title.lower() in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'" + + # Check text contains all required words (case-insensitive) + with open(result.get_media_by_id("html_source_code").filename, "r", encoding="utf-8") as f: + html_content = f.read() + assert in_text.lower() in html_content.lower(), ( + f"Expected HTML to contain '{in_text}', got '{html_content}'" + ) image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"] assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"