From 6f02493ff1aa9ae3f738bde945161d9096ab5dfa Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sun, 8 Jun 2025 14:36:55 +0100
Subject: [PATCH] adds clips extraction to VK, though generic_extractor should still be run for those

---
 .../antibot_extractor_enricher/dropins/vk.py | 12 ++++----
 tests/extractors/test_antibot_dropin_vk.py   | 30 +++++++++++++++----
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
index b36b517..6f54187 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -16,8 +16,9 @@ class VkDropin(Dropin):
     """
 
     WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
-    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
     VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
+    CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
+    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
 
     @staticmethod
     def suitable(url: str) -> bool:
@@ -28,24 +29,21 @@ class VkDropin(Dropin):
         """
         Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890'
         """
-        for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]:
+        for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.CLIP_PATTERN, VkDropin.PHOTO_PATTERN]:
             match = pattern.search(url)
             if match:
                 return f"https://vk.com/{match.group(1)}"
         return url
 
     def open_page(self, url) -> bool:
-        logger.debug("Checking if authenticated for VK...")
-        if self.sb.get_current_url() != url or self.sb.is_text_visible("Sign in to VK"):
-            logger.info("Opening VK page: {}", url)
+        if self.sb.is_text_visible("Sign in to VK"):
             self._login()
             self.sb.open(url)
-        logger.debug("VK page opened successfully.")
         return True
 
     def _login(self) -> bool:
         # TODO: test method
-        self.sb.activate_cdp_mode("https://vk.com")
+        self.sb.open("https://vk.com")
         self.sb.wait_for_ready_state_complete()
         if "/feed" in self.sb.get_current_url():
             logger.debug("Already logged in to VK.")
diff --git a/tests/extractors/test_antibot_dropin_vk.py b/tests/extractors/test_antibot_dropin_vk.py
index c39e2a3..8b3d9c2 100644
--- a/tests/extractors/test_antibot_dropin_vk.py
+++ b/tests/extractors/test_antibot_dropin_vk.py
@@ -5,6 +5,15 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
 @pytest.mark.parametrize(
     "input_url,expected",
     [
+        # Unrelated URL, should return unchanged
+        (
+            "https://vk.com/id123456",
+            "https://vk.com/id123456",
+        ),
+        (
+            "https://example.com/",
+            "https://example.com/",
+        ),
         # Wall post modal URL
         (
             "https://vk.com/somepage?w=wall-123456_7890",
@@ -53,14 +62,25 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
             "https://vk.com/video-111222_3334",
             "https://vk.com/video-111222_3334",
         ),
-        # Unrelated URL, should return unchanged
+        # Clip modal URL
         (
-            "https://vk.com/id123456",
-            "https://vk.com/id123456",
+            "https://vk.com/somepage?w=clip-555666_7778",
+            "https://vk.com/clip-555666_7778",
         ),
+        # Clip modal URL with no dash
         (
-            "https://example.com/",
-            "https://example.com/",
+            "https://vk.com/somepage?w=clip555666_7778",
+            "https://vk.com/clip555666_7778",
+        ),
+        # Clip modal URL with extra part
+        (
+            "https://vk.com/somepage?w=clip-555666_7778_ABC",
+            "https://vk.com/clip-555666_7778",
+        ),
+        # No modal, should return unchanged (clip)
+        (
+            "https://vk.com/clip-555666_7778",
+            "https://vk.com/clip-555666_7778",
         ),
         # Modal with multiple params, should still work with right priority
         (