Mirror of https://github.com/bellingcat/auto-archiver
Merge pull request #319 from bellingcat/feat/linkedin-antibot
Antibot Dropin for Linkedin
commit 3d31c7605b
@@ -1,6 +1,7 @@
tmp*/
temp/
.env*
!.env*.example
.DS_Store
expmt/
service_account.json
@@ -37,4 +38,5 @@ docs/source/modules/autogen/
scripts/settings_page.html
scripts/settings/src/schema.json
.vite
downloaded_files
+latest_logs
@@ -47,7 +47,6 @@ def generate_module_docs():

    for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
        # generate the markdown file from the __manifest__.py file.

        manifest = module.manifest
        for type in manifest["type"]:
            modules_by_type.setdefault(type, []).append(module)

@@ -64,6 +63,27 @@ def generate_module_docs():
        """
        steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest["type"])

+        if manifest.get("autodoc_dropins"):
+            loaded_module = module.load({})
+            dropins = loaded_module.load_dropins()
+            dropin_str = "\n##### Available Dropins\n"
+            for dropin in dropins:
+                if not (ddoc := dropin.documentation()):
+                    continue
+                dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
+                dropin_str += f"{ddoc.get('description')}\n\n"
+                if ddoc.get("site"):
+                    dropin_str += f"**Site**: {ddoc['site']}\n\n"
+                if dauth := ddoc.get("authentication"):
+                    dropin_str += "**YAML configuration**:\n"
+                    dropin_auth_yaml = "authentication:\n...\n"
+                    for site, creds in dauth.items():
+                        dropin_auth_yaml += f"  {site}:\n"
+                        for k, v in creds.items():
+                            dropin_auth_yaml += f'    {k}: "{v}"\n'
+                    dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
+            readme_str += dropin_str
+
        if not manifest["configs"]:
            config_string = f"# No configuration options for {module.name}.*\n"
        else:
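For reference, this is roughly the snippet the loop above appends to a module's generated docs. The example below is a sketch assembled from the `documentation()` dict of the LinkedinDropin added later in this PR, not verbatim generator output:

````{code} markdown
##### Available Dropins

###### Linkedin Dropin

Handles LinkedIn pages/posts; requires authentication to access most content but is still useful without it.

**Site**: linkedin.com

**YAML configuration**:
```{code} yaml
authentication:
...
  linkedin.com:
    username: "email address or phone number"
    password: "password"
...
```
````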
@@ -3,14 +3,14 @@
`pytest` is used for testing. There are two main types of tests:

1. 'core' tests which should be run on every change
-2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
+2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed; they take longer.


## Running Tests

1. Make sure you've installed the dev dependencies with `poetry install --with dev`
2. Tests can be run as follows:
-```
+```{code} bash
#### Command prefix of 'poetry run' removed here for simplicity
# run core tests
pytest -ra -v -m "not download"

@@ -18,4 +18,15 @@ pytest -ra -v -m "not download"
pytest -ra -v -m "download"
# run all tests
pytest -ra -v
+
+# run a specific test file
+pytest -ra -v tests/test_file.py
+# run a specific test function
+pytest -ra -v tests/test_file.py::test_function_name
```

+3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
+```{code} bash
+cp .env.test.example .env.test
+```
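To illustrate the split between the two test types, here is a minimal sketch of how a network-hitting test opts into the `download` marker (the test name and URL are illustrative, and the marker is assumed to be registered in the project's pytest configuration):

```{code} python
import urllib.request

import pytest


@pytest.mark.download
def test_example_site_is_up():
    # hits the network, so it is excluded by: pytest -m "not download"
    with urllib.request.urlopen("https://example.com", timeout=10) as resp:
        assert resp.status == 200
```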
@@ -15,19 +15,29 @@ We have dropped the `vk_extractor` because of problems in a project we relied on
Module 'vk_extractor' not found. Are you sure it's installed/exists?
```

## Dropping `screenshot_enricher` module
We have dropped the `screenshot_enricher` module because the new `antibot_extractor_enricher` module (see below) replaces its functionality more robustly and with less dependency hassle around geckodriver/Firefox. You will need to remove it from your configuration file, otherwise you will see an error like:

```{code} console
Module 'screenshot_enricher' not found. Are you sure it's installed/exists?
```


## New `antibot_extractor_enricher` module and VkDropin
-We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
+We have added a new [`antibot_extractor_enricher`](../modules/autogen/extractor/antibot_extractor_enricher.md) module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:

```{code} yaml
steps:
  extractors:
  - antibot_extractor_enricher

# or alternatively, if you want to use it as an enricher:
  enrichers:
  - antibot_extractor_enricher
```

It takes a full-page screenshot and a PDF capture, extracts the HTML source code, and grabs any other relevant media.

It comes with Dropins that we will keep adding to and maintaining.

> Dropin: a module with site-specific behaviours that is loaded automatically. You don't need to add Dropins to your configuration steps for them to run, though some need `authentication` configuration.

@@ -36,9 +46,9 @@ One such Dropin is the VkDropin which uses this automated browser to access VKontakte

```{code} yaml
authentication:
-  vk:
-    username: your_username
-    password: your_password
+  vk.com:
+    username: your_username
+    password: your_password
```

See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similar to the VkDropin; a combined sketch follows below.
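A combined `authentication` sketch covering the Dropins touched in this PR (the site keys and credential hints come from each Dropin's `documentation()` below; the values are placeholders):

```{code} yaml
authentication:
  vk.com:
    username: "phone number with country code"
    password: "password"
  reddit.com:
    username: "email address or username"
    password: "password"
  linkedin.com:
    username: "email address or phone number"
    password: "password"
```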
@@ -77,6 +77,8 @@ class Extractor(BaseModule):
        downloads a URL to provided filename, or inferred from URL, returns local filename
+        Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
        """
        if any(url.startswith(x) for x in ["blob:", "data:"]):
            return (None, url) if try_best_quality else None

        if try_best_quality:
            with suppress(Exception):

@@ -116,6 +118,8 @@ class Extractor(BaseModule):

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
+            if try_best_quality:
+                return None, url

    @abstractmethod
    def download(self, item: Metadata) -> Metadata | False:
@@ -34,7 +34,7 @@ from .config import (
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES, SetupError
-from auto_archiver.utils.url import check_url_or_raise
+from auto_archiver.utils.url import check_url_or_raise, clean

if TYPE_CHECKING:
    from .base_module import BaseModule

@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            raise e

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
-        url = original_url
+        url = clean(original_url)
        for a in self.extractors:
            url = a.sanitize_url(url)
@@ -31,11 +31,12 @@
            "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
        },
    },
+    "autodoc_dropins": True,
    "description": """
    Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.

-    Still in trial development, please report any issues or suggestions via GitHub Issues.
+    > ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).

    ### Features
    - Extracts the HTML source code of the page.
    - Takes full-page screenshots of web pages.

@@ -44,5 +45,9 @@

    ### Notes
    - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.

+    ### Dropins
+    This module uses sub-modules called Dropins for specific sites, which let it handle anti-bot measures and custom login flows. You don't need to include the Dropins in your configuration, but you do need to add authentication credentials if you want to get past login walls on those sites; see the detailed instructions for each Dropin below.
+
    """,
}
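For orientation, a sketch of how this module's options could look in an orchestration file, based on the `proxy` help text above (the server value is a placeholder, and the top-level module key assumes auto-archiver's convention of configuring modules by name):

```{code} yaml
steps:
  extractors:
  - antibot_extractor_enricher

antibot_extractor_enricher:
  # optional; format 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'
  proxy: "user:pass@proxy.example.com:8080"
```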
@@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
            self._enrich_download_media(
                sb,
                to_enrich,
-                css_selector=dropin.images_selectors(),
+                js_css_selector=dropin.js_for_image_css_selectors(),
                max_media=self.max_download_images - downloaded_images,
            )
            self._enrich_download_media(
                sb,
                to_enrich,
-                css_selector=dropin.video_selectors(),
+                js_css_selector=dropin.js_for_video_css_selectors(),
                max_media=self.max_download_videos - downloaded_videos,
            )
            logger.info(f"ANTIBOT completed for {url_sample}")

@@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
            to_enrich.add_media(Media(filename=pdf_filename), id="pdf")

    @logger.catch
-    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
+    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
        """
        Downloads media from the page and adds them to the Metadata object.
        This method is called by the enrich method.

@@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
        url = to_enrich.get_url()
        all_urls = set()

-        sources = sb.execute_script(f"""
-            return Array.from(document.querySelectorAll("{css_selector}"))
-            .map(el => el.src || el.href)
-            .filter(Boolean);
-        """)
+        sources = sb.execute_script(js_css_selector)
+        # the script comes from the dropin's js_for_*_css_selectors methods
        for src in sources:
            if len(all_urls) >= max_media:
                logger.debug(f"Reached max download limit of {max_media} images/videos.")
@@ -1,4 +1,5 @@
import os
+from typing import Mapping
from loguru import logger
from seleniumbase import SB
import yt_dlp

@@ -13,6 +14,19 @@ class Dropin:
    This class is designed to be a base class for drop-ins that can handle specific websites.
    """

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Each Dropin should auto-document itself with this method.
+        The returned dictionary can include:
+        - 'name': a string with the name of the dropin.
+        - 'description': a string describing the functionality of the dropin.
+        - 'site': a string with the site this dropin is for.
+        - 'authentication': a dictionary with an authentication example for the site.
+        """
+        return {}
+
    def __init__(self, sb: SB, extractor: Extractor):
        """
        Initialize the Dropin with the given SeleniumBase instance.

@@ -53,6 +67,26 @@ class Dropin:
        """
        return "video, source"

+    def js_for_image_css_selectors(self) -> str:
+        """
+        A configurable JS snippet that takes the CSS selector from the dropin itself and returns an array of image source URLs matching the selection.
+
+        You can override this instead of `images_selectors` for more control over scraped images.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
+    def js_for_video_css_selectors(self) -> str:
+        """
+        A configurable JS snippet that takes the CSS selector from the dropin itself and returns an array of video source URLs matching the selection.
+
+        You can override this instead of `video_selectors` for more control over scraped videos.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
    def open_page(self, url) -> bool:
        """
        Make sure the page is opened, even if it requires authentication, captcha solving, etc.

@@ -66,7 +100,7 @@ class Dropin:
        Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
        :return: A tuple (number of Images added, number of Videos added).
        """
-        raise NotImplementedError("This method should be implemented in the subclass")
+        return 0, 0

    def _get_username_password(self, site) -> tuple[str, str]:
        """
@@ -1,4 +1,3 @@
-from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


@@ -13,6 +12,3 @@ class DefaultDropin(Dropin):

    def open_page(self, url) -> bool:
        return True
-
-    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        return 0, 0
@@ -0,0 +1,74 @@
from typing import Mapping

from loguru import logger

from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


class LinkedinDropin(Dropin):
    """
    A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
    """

    @staticmethod
    def documentation() -> Mapping[str, str]:
        return {
            "name": "Linkedin Dropin",
            "description": "Handles LinkedIn pages/posts; requires authentication to access most content but is still useful without it. The first time you log in from a new IP, LinkedIn may require an email verification code; you can do a manual login first and then it won't ask for it again.",
            "site": "linkedin.com",
            "authentication": {
                "linkedin.com": {
                    "username": "email address or phone number",
                    "password": "password",
                }
            },
        }

    notifications_css_selector = 'a[href*="linkedin.com/notifications"]'

    @staticmethod
    def suitable(url: str) -> bool:
        return "linkedin.com" in url

    def js_for_image_css_selectors(self) -> str:
        get_all_css = "main img:not([src*='profile-displayphoto']):not([src*='profile-framedphoto'])"
        get_first_css = (
            "main img[src*='profile-framedphoto'], main img[src*='profile-displayphoto'], main img[src*='company-logo']"
        )

        return f"""
        const all = Array.from(document.querySelectorAll("{get_all_css}")).map(el => el.src || el.href).filter(Boolean);
        const profile = document.querySelector("{get_first_css}");
        return all.concat(profile?.src || profile?.href || []).filter(Boolean);
        """

    @staticmethod
    def video_selectors() -> str:
        # usually videos are from blob: but running the generic extractor should handle that
        return "main video"

    def open_page(self, url) -> bool:
        if not self.sb.is_element_present(self.notifications_css_selector):
            self._login()
        if url != self.sb.get_current_url():
            self.sb.open(url)
        return True

    @logger.catch
    def _login(self) -> bool:
        if self.sb.is_text_visible("Sign in to view more content"):
            self.sb.click_link_text("Sign in", timeout=2)
            self.sb.wait_for_ready_state_complete()
        else:
            self.sb.open("https://www.linkedin.com/login")
            self.sb.wait_for_ready_state_complete()

        username, password = self._get_username_password("linkedin.com")
        logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
        self.sb.type("#username", username)
        self.sb.type("#password", password)
        self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
        self.sb.click("button[type='submit']")
        self.sb.wait_for_ready_state_complete()
        # TODO: on suspicious login, LinkedIn may require an email verification code

        if not self.sb.is_element_present(self.notifications_css_selector):
            self.sb.click_if_visible('button[aria-label="Dismiss"]', timeout=0.5)
@@ -1,4 +1,5 @@
from contextlib import suppress
+from typing import Mapping

from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -10,6 +11,19 @@ class RedditDropin(Dropin):
    A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
    """

+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "Reddit Dropin",
+            "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
+            "site": "reddit.com",
+            "authentication": {
+                "reddit.com": {
+                    "username": "email address or username",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "reddit.com" in url

@@ -36,7 +50,7 @@ class RedditDropin(Dropin):
            self._close_cookies_banner()

        username, password = self._get_username_password("reddit.com")
-        logger.debug("RedditDropin Logging in to VK with username: {}", username)
+        logger.debug("RedditDropin Logging in to Reddit with username: {}", username)

        self.sb.type("#login-username", username)
        self.sb.type("#login-password", password)
@@ -1,4 +1,5 @@
import re
+from typing import Mapping

from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -16,6 +17,19 @@ class VkDropin(Dropin):
    CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")

+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "VKontakte Dropin",
+            "description": "Handles VKontakte posts and works without authentication for some content.",
+            "site": "vk.com",
+            "authentication": {
+                "vk.com": {
+                    "username": "phone number with country code",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "vk.com" in url

@@ -39,7 +53,7 @@ class VkDropin(Dropin):

    @logger.catch
    def _login(self) -> bool:
-        # TODO: test method
+        # TODO: test method, because current tests work without a login
        self.sb.open("https://vk.com")
        self.sb.wait_for_ready_state_complete()
        if "/feed" in self.sb.get_current_url():
@@ -7,8 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor

-from .dropin import GenericDropin, InfoExtractor
+from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor


class Twitter(GenericDropin):
@@ -1,5 +1,5 @@
import re
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from ipaddress import ip_address


@@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str:


def clean(url: str) -> str:
-    return url
+    TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}
+
+    parsed = urlparse(url)
+    clean_qs = [(k, v) for k, v in parse_qsl(parsed.query) if k not in TRACKERS]
+    return parsed._replace(query=urlencode(clean_qs)).geturl()


def is_auth_wall(url: str) -> bool:

@@ -109,6 +113,8 @@ def is_relevant_url(url: str) -> bool:
        # reddit
        ("styles.redditmedia.com",),  # opinionated but excludes many irrelevant images like avatars and banners
        ("emoji.redditmedia.com",),
+        # linkedin
+        ("static.licdn.com",),
    ]

    # TODO: make these globally configurable
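To make the behavior of the new `clean()` concrete, a small usage sketch (the expected outputs mirror the test cases added below):

```{code} python
from auto_archiver.utils.url import clean

# tracking parameters are stripped; other query params and fragments survive
assert clean("https://example.com/page?utm_source=google&foo=bar") == "https://example.com/page?foo=bar"
assert clean("https://example.com/page?utm_source=google#section-1") == "https://example.com/page#section-1"
```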
@@ -0,0 +1,10 @@
# ANTIBOT reddit test credentials
REDDIT_TEST_USERNAME=""
REDDIT_TEST_PASSWORD=""

# ANTIBOT linkedin test credentials
LINKEDIN_TEST_USERNAME=""
LINKEDIN_TEST_PASSWORD=""

# twitter test credentials
TWITTER_BEARER_TOKEN="TEST_KEY"
@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib

from loguru import logger
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory

@@ -20,6 +21,24 @@ from auto_archiver.core.module import ModuleFactory
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]


+def pytest_configure():
+    # load environment variables from the .env.test file
+    env_path = os.path.join(os.path.dirname(__file__), ".env.test")
+    if os.path.exists(env_path):
+        with open(env_path) as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                if "=" in line:
+                    key, value = line.split("=", 1)
+                    os.environ[key.strip()] = value.strip().lstrip('"').rstrip('"')
+    else:
+        logger.warning(
+            f"Environment file {env_path} not found. Skipping loading environment variables, some tests may fail."
+        )
+
+
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
@@ -1,3 +1,4 @@
+import os
import pytest

from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher

@@ -34,7 +35,18 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
        "save_to_pdf": False,
        "max_download_images": 0,
        "max_download_videos": 0,
        "user_data_dir": "./tests/tmp/user_data",
        "proxy": None,
+        "authentication": {
+            "reddit.com": {
+                "username": os.environ.get("REDDIT_TEST_USERNAME"),
+                "password": os.environ.get("REDDIT_TEST_PASSWORD"),
+            },
+            "linkedin.com": {
+                "username": os.environ.get("LINKEDIN_TEST_USERNAME"),
+                "password": os.environ.get("LINKEDIN_TEST_PASSWORD"),
+            },
+        },
    }

    @pytest.mark.download

@@ -82,10 +94,10 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
        """
        Test downloading pages with media.
        """

        self.extractor = setup_module(
            self.extractor_module,
-            {
+            self.config
+            | {
                "save_to_pdf": True,
                "max_download_images": 5,
                "max_download_videos": "inf",

@@ -118,6 +130,50 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
            f"Expected media with id '{expected_id}' not found"
        )

+    @pytest.mark.skipif(
+        not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"),
+        reason="No Reddit test credentials provided",
+    )
+    @pytest.mark.download
+    @pytest.mark.parametrize(
+        "url,in_title,in_text,image_count,video_count",
+        [
+            (
+                "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/",
+                "Duy tran is the owner and prime wood work artist",
+                " Created Jan 26, 2015",
+                4,
+                0,
+            ),
+        ],
+    )
+    def test_reddit_download_with_login(
+        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
+    ):
+        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
+
+    @pytest.mark.skipif(
+        not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"),
+        reason="No LinkedIn test credentials provided",
+    )
+    @pytest.mark.download
+    @pytest.mark.parametrize(
+        "url,in_title,in_text,image_count,video_count",
+        [
+            (
+                "https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/",
+                "Post",
+                "It takes time to go from hunch to reporting...",
+                2,
+                0,
+            ),
+        ],
+    )
+    def test_linkedin_download_with_login(
+        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
+    ):
+        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
+
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_html",
@@ -1,5 +1,6 @@
import pytest

from auto_archiver.utils.url import (
+    clean,
    is_auth_wall,
    check_url_or_raise,
    domain_for_url,

@@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality):
)
def test_get_media_url_best_quality(input_url, expected_url):
    assert get_media_url_best_quality(input_url) == expected_url


+@pytest.mark.parametrize(
+    "input_url,expected_url",
+    [
+        # No trackers present
+        ("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"),
+        # Single tracker present
+        ("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"),
+        # Multiple trackers present
+        ("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"),
+        # Trackers mixed with other params
+        (
+            "https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123",
+            "https://example.com/page?foo=bar&baz=qux",
+        ),
+        # Only trackers present
+        ("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"),
+        # No query string
+        ("https://example.com/page", "https://example.com/page"),
+        # Tracker in fragment (should not be removed)
+        ("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"),
+        # Tracker in query with a fragment present
+        ("https://example.com/page?utm_source=google#section-1", "https://example.com/page#section-1"),
+        # Tracker with empty value
+        ("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"),
+        # Tracker with multiple values
+        ("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"),
+        # Tracker with encoded value
+        ("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"),
+        # Unrelated param with a similar name
+        ("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"),
+    ],
+)
+def test_clean_removes_trackers(input_url, expected_url):
+    assert clean(input_url) == expected_url