kopia lustrzana https://github.com/bellingcat/auto-archiver
Adds clip extraction to the VK dropin, though generic_extractor should still be run for clips
rodzic
1f2d637928
commit
6f02493ff1
|
@ -16,8 +16,9 @@ class VkDropin(Dropin):
|
|||
"""
|
||||
|
||||
WALL_PATTERN = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
VIDEO_PATTERN = re.compile(r"(video.{0,1}\d+_\d+(?:_\w+)?)")
|
||||
CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
|
||||
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
|
||||
@staticmethod
|
||||
def suitable(url: str) -> bool:
|
||||
|
@ -28,24 +29,21 @@ class VkDropin(Dropin):
|
|||
"""
|
||||
Transforms modal URLs like 'https://vk.com/page_name?w=wall-123456_7890' to 'https://vk.com/wall-123456_7890'
|
||||
"""
|
||||
for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.PHOTO_PATTERN]:
|
||||
for pattern in [VkDropin.WALL_PATTERN, VkDropin.VIDEO_PATTERN, VkDropin.CLIP_PATTERN, VkDropin.PHOTO_PATTERN]:
|
||||
match = pattern.search(url)
|
||||
if match:
|
||||
return f"https://vk.com/{match.group(1)}"
|
||||
return url
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
logger.debug("Checking if authenticated for VK...")
|
||||
if self.sb.get_current_url() != url or self.sb.is_text_visible("Sign in to VK"):
|
||||
logger.info("Opening VK page: {}", url)
|
||||
if self.sb.is_text_visible("Sign in to VK"):
|
||||
self._login()
|
||||
self.sb.open(url)
|
||||
logger.debug("VK page opened successfully.")
|
||||
return True
|
||||
|
||||
def _login(self) -> bool:
|
||||
# TODO: test method
|
||||
self.sb.activate_cdp_mode("https://vk.com")
|
||||
self.sb.open("https://vk.com")
|
||||
self.sb.wait_for_ready_state_complete()
|
||||
if "/feed" in self.sb.get_current_url():
|
||||
logger.debug("Already logged in to VK.")
|
||||
|
|
|
@ -5,6 +5,15 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
|
|||
@pytest.mark.parametrize(
|
||||
"input_url,expected",
|
||||
[
|
||||
# Unrelated URL, should return unchanged
|
||||
(
|
||||
"https://vk.com/id123456",
|
||||
"https://vk.com/id123456",
|
||||
),
|
||||
(
|
||||
"https://example.com/",
|
||||
"https://example.com/",
|
||||
),
|
||||
# Wall post modal URL
|
||||
(
|
||||
"https://vk.com/somepage?w=wall-123456_7890",
|
||||
|
@ -53,14 +62,25 @@ from auto_archiver.modules.antibot_extractor_enricher.dropins.vk import VkDropin
|
|||
"https://vk.com/video-111222_3334",
|
||||
"https://vk.com/video-111222_3334",
|
||||
),
|
||||
# Unrelated URL, should return unchanged
|
||||
# Clip modal URL
|
||||
(
|
||||
"https://vk.com/id123456",
|
||||
"https://vk.com/id123456",
|
||||
"https://vk.com/somepage?w=clip-555666_7778",
|
||||
"https://vk.com/clip-555666_7778",
|
||||
),
|
||||
# Clip modal URL with no dash
|
||||
(
|
||||
"https://example.com/",
|
||||
"https://example.com/",
|
||||
"https://vk.com/somepage?w=clip555666_7778",
|
||||
"https://vk.com/clip555666_7778",
|
||||
),
|
||||
# Clip modal URL with extra part
|
||||
(
|
||||
"https://vk.com/somepage?w=clip-555666_7778_ABC",
|
||||
"https://vk.com/clip-555666_7778",
|
||||
),
|
||||
# No modal, should return unchanged (clip)
|
||||
(
|
||||
"https://vk.com/clip-555666_7778",
|
||||
"https://vk.com/clip-555666_7778",
|
||||
),
|
||||
# Modal with multiple params, should still work with right priority
|
||||
(
|
||||
|
|
Ładowanie…
Reference in New Issue