kopia lustrzana https://github.com/bellingcat/auto-archiver
adds tests and minor improvements
rodzic
48c1ab3c1f
commit
d13a5ef003
|
@ -93,7 +93,6 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
||||
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
||||
url = to_enrich.get_url()
|
||||
# TODO: implement cookie auth via self.auth_for_site(url), combined with a UrlUtil.is_auth_wall(url) check, as in ScreenshotEnricher
|
||||
url_sample = url[:75]
|
||||
|
||||
try:
|
||||
|
|
|
@ -24,7 +24,6 @@ class Dropin:
|
|||
def suitable(url: str) -> bool:
|
||||
"""
|
||||
Check if the URL is suitable for processing with this dropin.
|
||||
|
||||
:param url: The URL to check.
|
||||
:return: True if the URL is suitable for processing, False otherwise.
|
||||
"""
|
||||
|
@ -33,7 +32,7 @@ class Dropin:
|
|||
@staticmethod
|
||||
def sanitize_url(url: str) -> str:
|
||||
"""
|
||||
Used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
Used to clean URLs before processing them.
|
||||
"""
|
||||
return url
|
||||
|
||||
|
@ -48,8 +47,6 @@ class Dropin:
|
|||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
||||
"""
|
||||
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
|
||||
|
||||
|
||||
:return: A tuple (number of Images added, number of Videos added).
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
|
|
@ -21,19 +21,14 @@ class VkDropin(Dropin):
|
|||
|
||||
@staticmethod
|
||||
def suitable(url: str) -> bool:
|
||||
"""
|
||||
Only suitable for VK URLs that match the wall, photo, or video patterns.
|
||||
Otherwise, for example, for pages a large amount of media may be downloaded.
|
||||
"""
|
||||
return "vk.com" in url
|
||||
|
||||
@staticmethod
|
||||
def sanitize_url(url: str) -> str:
|
||||
# TODO: test method
|
||||
"""
|
||||
Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890'
|
||||
"""
|
||||
for pattern in [VkDropin.WALL_PATTERN, VkDropin.PHOTO_PATTERN, VkDropin.VIDEO_PATTERN]:
|
||||
for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]:
|
||||
match = pattern.search(url)
|
||||
if match:
|
||||
return f"https://vk.com/{match.group(1)}"
|
||||
|
@ -49,6 +44,7 @@ class VkDropin(Dropin):
|
|||
return True
|
||||
|
||||
def _login(self) -> bool:
|
||||
# TODO: test method
|
||||
self.sb.activate_cdp_mode("https://vk.com")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
if "/feed" in self.sb.get_current_url():
|
||||
|
@ -91,8 +87,10 @@ class VkDropin(Dropin):
|
|||
|
||||
:return: A tuple (number of Images added, number of Videos added).
|
||||
"""
|
||||
max_videos = self.extractor.max_download_videos
|
||||
video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')][:max_videos]
|
||||
video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
|
||||
if type(self.extractor.max_download_videos) is int:
|
||||
video_urls = video_urls[: self.extractor.max_download_videos]
|
||||
|
||||
if not video_urls:
|
||||
return 0, 0
|
||||
|
||||
|
@ -100,7 +98,7 @@ class VkDropin(Dropin):
|
|||
ydl_options = [
|
||||
"-o",
|
||||
os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
|
||||
# "--quiet",
|
||||
"--quiet",
|
||||
"--no-playlist",
|
||||
"--no-write-subs",
|
||||
"--no-write-auto-subs",
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
import pytest
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_url", "expected"),
    [
        # --- Modal URLs (?w=...) are rewritten to the direct post URL ---
        # wall post, with and without the dash before the owner id
        ("https://vk.com/somepage?w=wall-123456_7890", "https://vk.com/wall-123456_7890"),
        ("https://vk.com/somepage?w=wall123456_7890", "https://vk.com/wall123456_7890"),
        # photo, with and without the dash
        ("https://vk.com/somepage?w=photo-654321_9876", "https://vk.com/photo-654321_9876"),
        ("https://vk.com/somepage?w=photo654321_9876", "https://vk.com/photo654321_9876"),
        # video: plain, with an extra trailing part, and without the dash
        ("https://vk.com/somepage?w=video-111222_3334", "https://vk.com/video-111222_3334"),
        ("https://vk.com/somepage?w=video-111222_3334_ABC", "https://vk.com/video-111222_3334_ABC"),
        ("https://vk.com/somepage?w=video111222_3334", "https://vk.com/video111222_3334"),
        # --- Already-direct URLs (no modal) come back unchanged ---
        ("https://vk.com/wall-123456_7890", "https://vk.com/wall-123456_7890"),
        ("https://vk.com/photo-654321_9876", "https://vk.com/photo-654321_9876"),
        ("https://vk.com/video-111222_3334", "https://vk.com/video-111222_3334"),
        # --- Unrelated URLs come back unchanged ---
        ("https://vk.com/id123456", "https://vk.com/id123456"),
        ("https://example.com/", "https://example.com/"),
        # --- Several modal-like params: wall/video must win over photo, wall over video ---
        ("https://vk.com/somepage?z=photo-654321_9876&w=wall-123456_7890", "https://vk.com/wall-123456_7890"),
        ("https://vk.com/somepage?z=photo-654321_9876&w=video-111222_3334", "https://vk.com/video-111222_3334"),
        ("https://vk.com/somepage?z=video-111222_3334&w=wall-654321_9876", "https://vk.com/wall-654321_9876"),
    ],
)
def test_sanitize_url(input_url, expected):
    """Check that ``VkDropin.sanitize_url`` maps ``input_url`` to ``expected``."""
    sanitized = VkDropin.sanitize_url(input_url)
    assert sanitized == expected
|
|
@ -40,35 +40,46 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize(
|
||||
"url,in_title,image_count,video_count",
|
||||
"url,in_title,in_text,image_count,video_count",
|
||||
[
|
||||
(
|
||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||
"western barn owl",
|
||||
"Tyto alba",
|
||||
5,
|
||||
0,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||
"open sources show myanmar",
|
||||
"Bellingcat has geolocated",
|
||||
5,
|
||||
0,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||
"shot from above",
|
||||
"continued the work of Gazan journalists",
|
||||
5,
|
||||
1,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/about/general-information",
|
||||
"general information",
|
||||
"Stichting Bellingcat",
|
||||
0, # SVGs are ignored
|
||||
0,
|
||||
),
|
||||
(
|
||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||
"Hounds of Love",
|
||||
"16 сентября 1985 года лейблом EMI Records.",
|
||||
5,
|
||||
0,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, image_count, video_count):
|
||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
||||
"""
|
||||
Test downloading pages with media.
|
||||
"""
|
||||
|
@ -81,7 +92,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||
"max_download_videos": "inf",
|
||||
},
|
||||
)
|
||||
|
||||
url = self.extractor.sanitize_url(url)
|
||||
item = make_item(url)
|
||||
result = self.extractor.download(item)
|
||||
|
||||
|
@ -89,7 +100,14 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||
|
||||
# Check the title contains the expected substring (case-insensitive)
|
||||
page_title = result.get_title() or ""
|
||||
assert in_title in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"
|
||||
assert in_title.lower() in page_title.lower(), f"Expected title to contain '{in_title}', got '{page_title}'"
|
||||
|
||||
# Check the saved HTML source contains the expected text (case-insensitive)
|
||||
with open(result.get_media_by_id("html_source_code").filename, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
assert in_text.lower() in html_content.lower(), (
|
||||
f"Expected HTML to contain '{in_text}', got '{html_content}'"
|
||||
)
|
||||
|
||||
image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"]
|
||||
assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"
|
||||
|
|
Ładowanie…
Reference in New Issue