adds clips extraction to VK, though generic_extractor should still be run for those

pull/313/head
msramalho 2025-06-08 14:36:55 +01:00
rodzic 1f2d637928
commit 6f02493ff1
Nie znaleziono w bazie danych klucza dla tego podpisu
2 zmienionych plików z 30 dodań i 12 usunięć

Wyświetl plik

@@ -16,8 +16,9 @@ class VkDropin(Dropin):
"""
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
@staticmethod
def suitable(url: str) -> bool:
@@ -28,24 +29,21 @@ class VkDropin(Dropin):
"""
Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890'
"""
for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]:
for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.CLIP_PATTERN, VkDropin.PHOTO_PATTERN]:
match = pattern.search(url)
if match:
return f"https://vk.com/{match.group(1)}"
return url
def open_page(self, url) -> bool:
logger.debug("Checking if authenticated for VK...")
if self.sb.get_current_url() != url or self.sb.is_text_visible("Sign in to VK"):
logger.info("Opening VK page: {}", url)
if self.sb.is_text_visible("Sign in to VK"):
self._login()
self.sb.open(url)
logger.debug("VK page opened successfully.")
return True
def _login(self) -> bool:
# TODO: test method
self.sb.activate_cdp_mode("https://vk.com")
self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url():
logger.debug("Already logged in to VK.")

Wyświetl plik

@@ -5,6 +5,15 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
@pytest.mark.parametrize(
"input_url,expected",
[
# Unrelated URL, should return unchanged
(
"https://vk.com/id123456",
"https://vk.com/id123456",
),
(
"https://example.com/",
"https://example.com/",
),
# Wall post modal URL
(
"https://vk.com/somepage?w=wall-123456_7890",
@@ -53,14 +62,25 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
"https://vk.com/video-111222_3334",
"https://vk.com/video-111222_3334",
),
# Unrelated URL, should return unchanged
# Clip modal URL
(
"https://vk.com/id123456",
"https://vk.com/id123456",
"https://vk.com/somepage?w=clip-555666_7778",
"https://vk.com/clip-555666_7778",
),
# Clip modal URL with no dash
(
"https://example.com/",
"https://example.com/",
"https://vk.com/somepage?w=clip555666_7778",
"https://vk.com/clip555666_7778",
),
# Clip modal URL with extra part
(
"https://vk.com/somepage?w=clip-555666_7778_ABC",
"https://vk.com/clip-555666_7778",
),
# No modal, should return unchanged (clip)
(
"https://vk.com/clip-555666_7778",
"https://vk.com/clip-555666_7778",
),
# Modal with multiple params, should still work with right priority
(