adds tests and minor improvements

pull/313/head
msramalho 2025-06-07 19:58:18 +01:00
parent 48c1ab3c1f
commit d13a5ef003
5 changed files with 111 additions and 18 deletions

View file

@@ -93,7 +93,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
     def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
         using_user_data_dir = self.user_data_dir if custom_data_dir else None
         url = to_enrich.get_url()
-        # TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher
         url_sample = url[:75]
         try:

View file

@@ -24,7 +24,6 @@ class Dropin:
     def suitable(url: str) -> bool:
         """
         Check if the URL is suitable for processing with this dropin.
-
         :param url: The URL to check.
         :return: True if the URL is suitable for processing, False otherwise.
         """
@@ -33,7 +32,7 @@ class Dropin:
     @staticmethod
     def sanitize_url(url: str) -> str:
         """
-        Used to clean unnecessary URL parameters OR unfurl redirect links
+        Used to clean URLs before processing them.
         """
         return url
@@ -48,8 +47,6 @@ class Dropin:
     def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
         """
         Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
-
         :return: A tuple (number of Images added, number of Videos added).
         """
-
         raise NotImplementedError("This method should be implemented in the subclass")
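
As a reader aid (not part of this commit), a minimal sketch of how a site-specific dropin might implement this interface; the import paths and the `ExampleDropin` name are assumptions, not code from the repository:

# Sketch only -- illustrative, not part of this commit. Import paths are assumed.
from auto_archiver.core import Metadata  # assumed location of Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin  # assumed location of Dropin


class ExampleDropin(Dropin):
    """Hypothetical dropin for an imaginary example.com site."""

    @staticmethod
    def suitable(url: str) -> bool:
        # Claim only URLs this dropin knows how to handle.
        return "example.com" in url

    @staticmethod
    def sanitize_url(url: str) -> str:
        # Clean the URL before processing, e.g. drop query parameters.
        return url.split("?", 1)[0]

    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
        # A real implementation would collect images/videos via SeleniumBase
        # and attach them to `to_enrich`; this sketch adds nothing.
        return 0, 0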

View file

@@ -21,19 +21,14 @@ class VkDropin(Dropin):
     @staticmethod
     def suitable(url: str) -> bool:
-        """
-        Only suitable for VK URLs that match the wall, photo, or video patterns.
-        Otherwise, for example, for pages a large amount of media may be downloaded.
-        """
         return "vk.com" in url
 
     @staticmethod
     def sanitize_url(url: str) -> str:
-        # TODO: test method
         """
         Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890'
         """
-        for pattern in [VkDropin.WALL_PATTERN, VkDropin.PHOTO_PATTERN, VkDropin.VIDEO_PATTERN]:
+        for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]:
             match = pattern.search(url)
             if match:
                 return f"https://vk.com/{match.group(1)}"
@@ -49,6 +44,7 @@ class VkDropin(Dropin):
         return True
 
     def _login(self) -> bool:
+        # TODO: test method
         self.sb.activate_cdp_mode("https://vk.com")
         self.sb.wait_for_ready_state_complete()
         if "/feed" in self.sb.get_current_url():
@@ -91,8 +87,10 @@ class VkDropin(Dropin):
         :return: A tuple (number of Images added, number of Videos added).
         """
-        max_videos = self.extractor.max_download_videos
-        video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')][:max_videos]
+        video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
+        if type(self.extractor.max_download_videos) is int:
+            video_urls = video_urls[: self.extractor.max_download_videos]
 
         if not video_urls:
             return 0, 0
@@ -100,7 +98,7 @@ class VkDropin(Dropin):
         ydl_options = [
             "-o",
             os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
-            # "--quiet",
+            "--quiet",
             "--no-playlist",
             "--no-write-subs",
             "--no-write-auto-subs",

View file

@@ -0,0 +1,81 @@
import pytest
from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin


@pytest.mark.parametrize(
    "input_url,expected",
    [
        # Wall post modal URL
        (
            "https://vk.com/somepage?w=wall-123456_7890",
            "https://vk.com/wall-123456_7890",
        ),
        # Wall post modal URL with no dash
        (
            "https://vk.com/somepage?w=wall123456_7890",
            "https://vk.com/wall123456_7890",
        ),
        # Photo modal URL
        (
            "https://vk.com/somepage?w=photo-654321_9876",
            "https://vk.com/photo-654321_9876",
        ),
        # Photo modal URL with no dash
        (
            "https://vk.com/somepage?w=photo654321_9876",
            "https://vk.com/photo654321_9876",
        ),
        # Video modal URL
        (
            "https://vk.com/somepage?w=video-111222_3334",
            "https://vk.com/video-111222_3334",
        ),
        # Video modal URL with extra part
        (
            "https://vk.com/somepage?w=video-111222_3334_ABC",
            "https://vk.com/video-111222_3334_ABC",
        ),
        # Video modal URL with no dash
        (
            "https://vk.com/somepage?w=video111222_3334",
            "https://vk.com/video111222_3334",
        ),
        # No modal, should return unchanged
        (
            "https://vk.com/wall-123456_7890",
            "https://vk.com/wall-123456_7890",
        ),
        (
            "https://vk.com/photo-654321_9876",
            "https://vk.com/photo-654321_9876",
        ),
        (
            "https://vk.com/video-111222_3334",
            "https://vk.com/video-111222_3334",
        ),
        # Unrelated URL, should return unchanged
        (
            "https://vk.com/id123456",
            "https://vk.com/id123456",
        ),
        (
            "https://example.com/",
            "https://example.com/",
        ),
        # Modal with multiple params, should still work with right priority
        (
            "https://vk.com/somepage?z=photo-654321_9876&w=wall-123456_7890",
            "https://vk.com/wall-123456_7890",
        ),
        (
            "https://vk.com/somepage?z=photo-654321_9876&w=video-111222_3334",
            "https://vk.com/video-111222_3334",
        ),
        (
            "https://vk.com/somepage?z=video-111222_3334&w=wall-654321_9876",
            "https://vk.com/wall-654321_9876",
        ),
    ],
)
def test_sanitize_url(input_url, expected):
    assert VkDropin.sanitize_url(input_url) == expected
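
For orientation (not part of this diff), the WALL/VIDEO/PHOTO patterns these tests exercise are defined elsewhere in VkDropin. A self-contained sketch of regexes and ordering consistent with the expected outputs above, with assumed pattern shapes rather than the repository's actual definitions, would be:

# Assumed pattern shapes only -- VkDropin's real definitions are not shown in this diff.
import re

WALL_PATTERN = re.compile(r"(wall-?\d+_\d+)")
VIDEO_PATTERN = re.compile(r"(video-?\d+_\d+(?:_[A-Za-z0-9]+)?)")
PHOTO_PATTERN = re.compile(r"(photo-?\d+_\d+)")


def sanitize_url(url: str) -> str:
    # Wall first, then video before photo, matching the reordering in this commit.
    for pattern in (WALL_PATTERN, VIDEO_PATTERN, PHOTO_PATTERN):
        match = pattern.search(url)
        if match:
            return f"https://vk.com/{match.group(1)}"
    return url


# Mirrors two of the parametrized cases above.
assert sanitize_url("https://vk.com/somepage?w=wall-123456_7890") == "https://vk.com/wall-123456_7890"
assert sanitize_url("https://vk.com/somepage?z=photo-654321_9876&w=video-111222_3334") == "https://vk.com/video-111222_3334"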

View file

@@ -40,35 +40,46 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
     @pytest.mark.download
     @pytest.mark.parametrize(
-        "url,in_title,image_count,video_count",
+        "url,in_title,in_text,image_count,video_count",
         [
             (
                 "https://en.wikipedia.org/wiki/Western_barn_owl",
                 "western barn owl",
+                "Tyto alba",
                 5,
                 0,
             ),
             (
                 "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
                 "open sources show myanmar",
+                "Bellingcat has geolocated",
                 5,
                 0,
             ),
             (
                 "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
                 "shot from above",
+                "continued the work of Gazan journalists",
                 5,
                 1,
             ),
             (
                 "https://www.bellingcat.com/about/general-information",
                 "general information",
+                "Stichting Bellingcat",
                 0,  # SVGs are ignored
                 0,
             ),
+            (
+                "https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
+                "Hounds of Love",
+                "16 сентября 1985 года лейблом EMI Records.",
+                5,
+                0,
+            ),
         ],
     )
-    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count):
+    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
         """
         Test downloading pages with media.
         """
@@ -81,7 +92,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
                 "max_download_videos": "inf",
             },
         )
-
+        url = self.extractor.sanitize_url(url)
 
         item = make_item(url)
         result = self.extractor.download(item)
@@ -89,7 +100,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
         # Check title contains all required words (case-insensitive)
         page_title = result.get_title() or ""
-        assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"
+        assert in_title.lower() in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"
+
+        # Check text contains all required words (case-insensitive)
+        with open(result.get_media_by_id("html_source_code").filename, "r", encoding="utf-8") as f:
+            html_content = f.read()
+        assert in_text.lower() in html_content.lower(), (
+            f"Expected HTML to contain '{in_text}', got '{html_content}'"
+        )
 
         image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"]
         assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"