diff --git a/.gitignore b/.gitignore
index 35eee83..d14e3bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 tmp*/
 temp/
 .env*
+!.env*.example
 .DS_Store
 expmt/
 service_account.json
@@ -37,4 +38,5 @@ docs/source/modules/autogen/
 scripts/settings_page.html
 scripts/settings/src/schema.json
 .vite
-downloaded_files
\ No newline at end of file
+downloaded_files
+latest_logs
\ No newline at end of file
diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py
index bfddd29..f9cb13d 100644
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@@ -64,6 +64,27 @@ def generate_module_docs():
 """
         steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest["type"])
 
+        if manifest.get("autodoc_dropins"):
+            loaded_module = module.load({})
+            dropins = loaded_module.load_dropins()
+            dropin_str = "\n##### Available Dropins\n"
+            for dropin in dropins:
+                if not (ddoc := dropin.documentation()):
+                    continue
+                dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
+                dropin_str += f"{ddoc.get('description')}\n\n"
+                if ddoc.get("site"):
+                    dropin_str += f"**Site**: {ddoc['site']}\n\n"
+                if dauth := ddoc.get("authentication"):
+                    dropin_str += "**YAML configuration**:\n"
+                    dropin_auth_yaml = "authentication:\n...\n"
+                    for site, creds in dauth.items():
+                        dropin_auth_yaml += f"  {site}:\n"
+                        for k, v in creds.items():
+                            dropin_auth_yaml += f'    {k}: "{v}"\n'
+                    dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
+            readme_str += dropin_str
+
         if not manifest["configs"]:
             config_string = f"# No configuration options for {module.name}.*\n"
         else:
diff --git a/docs/source/development/testing.md b/docs/source/development/testing.md
index 5de9574..290592c 100644
--- a/docs/source/development/testing.md
+++ b/docs/source/development/testing.md
@@ -3,14 +3,14 @@
 `pytest` is used for testing. There are two main types of tests:
 
 1. 'core' tests which should be run on every change
-2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
+2. 'download' tests which hit the network. These tests do things like make API calls (e.g. Twitter, Bluesky, etc.) and should be run regularly to make sure that APIs have not changed; they also take longer to run.
 
 ## Running Tests
 
-1. Make sure you've installed the dev dependencies with `pytest install --with dev`
+1. Make sure you've installed the dev dependencies with `poetry install --with dev`
 2. Tests can be run as follows:
-```
+```{code} bash
 #### Command prefix of 'poetry run' removed here for simplicity
 # run core tests
 pytest -ra -v -m "not download"
@@ -18,4 +18,26 @@ pytest -ra -v -m "not download"
 pytest -ra -v -m "download"
 # run all tests
 pytest -ra -v
+
+
+# run a specific test file
+pytest -ra -v tests/test_file.py
+# run a specific test function
+pytest -ra -v tests/test_file.py::test_function_name
+```
+
+3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
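+
+For example, a filled `.env.test` for the antibot login tests might look like this (the values below are placeholders, not real credentials; the variable names come from `tests/.env.test.example`):
+
+```{code} bash
+REDDIT_TEST_USERNAME="my_reddit_user"
+REDDIT_TEST_PASSWORD="my_reddit_password"
+LINKEDIN_TEST_USERNAME="my.email@example.com"
+LINKEDIN_TEST_PASSWORD="my_linkedin_password"
+```
+
+To create it from the template: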
+```{code} bash
+cp .env.test.example .env.test
 ```
\ No newline at end of file
diff --git a/docs/source/how_to/authentication_how_to.md b/docs/source/how_to/01_authentication_how_to.md
similarity index 100%
rename from docs/source/how_to/authentication_how_to.md
rename to docs/source/how_to/01_authentication_how_to.md
diff --git a/docs/source/how_to/gsheets_setup.md b/docs/source/how_to/02_gsheets_setup.md
similarity index 100%
rename from docs/source/how_to/gsheets_setup.md
rename to docs/source/how_to/02_gsheets_setup.md
diff --git a/docs/source/how_to/logging.md b/docs/source/how_to/03_logging.md
similarity index 100%
rename from docs/source/how_to/logging.md
rename to docs/source/how_to/03_logging.md
diff --git a/docs/source/how_to/run_instagrapi_server.md b/docs/source/how_to/04_run_instagrapi_server.md
similarity index 100%
rename from docs/source/how_to/run_instagrapi_server.md
rename to docs/source/how_to/04_run_instagrapi_server.md
diff --git a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md b/docs/source/how_to/05_upgrading_to_1_1_0.md
similarity index 59%
rename from docs/source/how_to/upgrading_1_0_1_to_1_1_0.md
rename to docs/source/how_to/05_upgrading_to_1_1_0.md
index 81e00e2..57bc253 100644
--- a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md
+++ b/docs/source/how_to/05_upgrading_to_1_1_0.md
@@ -15,19 +15,29 @@ We have dropped the `vk_extractor` because of problems in a project we relied on
 Module 'vk_extractor' not found. Are you sure it's installed/exists?
 ```
 
+## Dropping `screenshot_enricher` module
+We have dropped the `screenshot_enricher` module because the new `antibot_extractor_enricher` module (see below) replaces its functionality more robustly, without the geckodriver/Firefox dependency hassle. You will need to remove it from your configuration file; otherwise you will see an error like:
+
+```{code} console
+Module 'screenshot_enricher' not found. Are you sure it's installed/exists?
+```
+
+
 ## New `antibot_extractor_enricher` module and VkDropin
-We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
+We have added a new [`antibot_extractor_enricher`](../modules/autogen/extractor/antibot_extractor_enricher.md) module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
 
 ```{code} yaml
 steps:
-    extractors:
-    - antibot_extractor_enricher
+  extractors:
+  - antibot_extractor_enricher
 
-    # or alternatively, if you want to use it as an enricher:
-    enrichers:
-    - antibot_extractor_enricher
+  # or alternatively, if you want to use it as an enricher:
+  enrichers:
+  - antibot_extractor_enricher
 ```
 
+It will take a full-page screenshot, capture a PDF, extract the HTML source code, and download any other relevant media.
+
 It comes with Dropins that we will be adding and maintaining.
 
 > Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add Dropins to your configuration steps for them to run, though some require `authentication` settings.
@@ -36,9 +46,9 @@ One such Dropin is the VkDropin which uses this automated browser to access VKon
 
 ```{code} yaml
 authentication:
-    vk:
-        username: your_username
-        password: your_password
+  vk.com:
+    username: your_username
+    password: your_password
 ```
 
 See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin.
\ No newline at end of file
diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py
index ca3359d..5dca928 100644
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -77,6 +77,8 @@ class Extractor(BaseModule):
         downloads a URL to provided filename, or inferred from URL, returns local filename
         Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
         """
+        if any(url.startswith(x) for x in ["blob:", "data:"]):
+            return (None, url) if try_best_quality else None
 
         if try_best_quality:
             with suppress(Exception):
@@ -116,6 +118,8 @@ class Extractor(BaseModule):
         except requests.RequestException as e:
             logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
 
+        if try_best_quality:
+            return None, url
 
     @abstractmethod
     def download(self, item: Metadata) -> Metadata | False:
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 0bff376..66073a7 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -34,7 +34,7 @@ from .config import (
 from .module import ModuleFactory, LazyBaseModule
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .consts import MODULE_TYPES, SetupError
-from auto_archiver.utils.url import check_url_or_raise
+from auto_archiver.utils.url import check_url_or_raise, clean
 
 if TYPE_CHECKING:
     from .base_module import BaseModule
@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
             raise e
 
         # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
-        url = original_url
+        url = clean(original_url)
         for a in self.extractors:
             url = a.sanitize_url(url)
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
index e2bcad9..f08547b 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
@@ -31,11 +31,12 @@
             "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
         },
     },
+    "autodoc_dropins": True,
    "description": """
    Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.

-    Still in trial development, please report any issues or suggestions via GitHub Issues.
-
+    > ⚠️ Still in trial development; please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
+
    ### Features
    - Extracts the HTML source code of the page.
    - Takes full-page screenshots of web pages.
@@ -44,5 +45,9 @@
 
    ### Notes
    - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
+
+    ### Dropins
+    This module uses site-specific sub-modules, called Dropins, to handle anti-bot measures and custom login flows. You don't need to include the Dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites; see the detailed instructions for each Dropin below.
+
    """,
 }
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
index 1982389..04e4702 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
             self._enrich_download_media(
                 sb,
                 to_enrich,
-                css_selector=dropin.images_selectors(),
+                js_css_selector=dropin.js_for_image_css_selectors(),
                 max_media=self.max_download_images - downloaded_images,
             )
             self._enrich_download_media(
                 sb,
                 to_enrich,
-                css_selector=dropin.video_selectors(),
+                js_css_selector=dropin.js_for_video_css_selectors(),
                 max_media=self.max_download_videos - downloaded_videos,
             )
             logger.info(f"ANTIBOT completed for {url_sample}")
@@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
         to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
 
     @logger.catch
-    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
+    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
         """
         Downloads media from the page and adds them to the Metadata object.
         This method is called by the enrich method.
@@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
         url = to_enrich.get_url()
         all_urls = set()
 
-        sources = sb.execute_script(f"""
-            return Array.from(document.querySelectorAll("{css_selector}"))
-            .map(el => el.src || el.href)
-            .filter(Boolean);
-        """)
+        sources = sb.execute_script(js_css_selector)
+        # the dropin's js_for_*_css_selectors() methods provide the script that returns the media URLs
         for src in sources:
             if len(all_urls) >= max_media:
                 logger.debug(f"Reached max download limit of {max_media} images/videos.")
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 15c2e28..d4b255d 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,4 +1,5 @@
 import os
+from typing import Mapping
 from loguru import logger
 from seleniumbase import SB
 import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
     This class is designed to be a base class for drop-ins that can handle specific websites.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Each Dropin should auto-document itself with this method.
+        The returned dictionary can include:
+        - 'name': A string representing the name of the dropin.
+        - 'description': A string describing the functionality of the dropin.
+        - 'site': A string representing the site this dropin is for.
+        - 'authentication': A dictionary with an example authentication configuration for the site.
+
+        """
+        return {}
+
     def __init__(self, sb: SB, extractor: Extractor):
         """
         Initialize the Dropin with the given SeleniumBase instance.
@@ -53,6 +67,26 @@ class Dropin:
         """
         return "video, source"
 
+    def js_for_image_css_selectors(self) -> str:
+        """
+        A configurable JS script that takes the dropin's own image CSS selector and returns an array of image URLs (src or href) matching the selection.
+
+        You can override this instead of `images_selectors` for more control over scraped images.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
+    def js_for_video_css_selectors(self) -> str:
+        """
+        A configurable JS script that takes the dropin's own video CSS selector and returns an array of video URLs (src or href) matching the selection.
+
+        You can override this instead of `video_selectors` for more control over scraped videos.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
     def open_page(self, url) -> bool:
         """
         Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -66,7 +100,7 @@ class Dropin:
         Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
         :return: A tuple (number of Images added, number of Videos added).
         """
-        raise NotImplementedError("This method should be implemented in the subclass")
+        return 0, 0
 
     def _get_username_password(self, site) -> tuple[str, str]:
         """
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py
index c5c865a..72ec3f0 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py
@@ -1,4 +1,3 @@
-from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
 
 
@@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
 
     def open_page(self, url) -> bool:
         return True
-
-    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        return 0, 0
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
new file mode 100644
index 0000000..336b630
--- /dev/null
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
@@ -0,0 +1,74 @@
+from typing import Mapping
+from loguru import logger
+from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
+
+
+class LinkedinDropin(Dropin):
+    """
+    A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
+    """
+
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "Linkedin Dropin",
+            "description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it.
The first time you log in from a new IP, LinkedIn may require an email verification code; you can do a manual login first and then it won't ask for it again.",
+            "site": "linkedin.com",
+            "authentication": {
+                "linkedin.com": {
+                    "username": "email address or phone number",
+                    "password": "password",
+                }
+            },
+        }
+
+    notifications_css_selector = 'a[href*="linkedin.com/notifications"]'
+
+    @staticmethod
+    def suitable(url: str) -> bool:
+        return "linkedin.com" in url
+
+    def js_for_image_css_selectors(self) -> str:
+        get_all_css = "main img:not([src*='profile-displayphoto']):not([src*='profile-framedphoto'])"
+        get_first_css = (
+            "main img[src*='profile-framedphoto'], main img[src*='profile-displayphoto'], main img[src*='company-logo']"
+        )
+
+        return f"""
+        const all = Array.from(document.querySelectorAll("{get_all_css}")).map(el => el.src || el.href).filter(Boolean);
+        const profile = document.querySelector("{get_first_css}");
+        return all.concat(profile?.src || profile?.href || []).filter(Boolean);
+        """
+
+    @staticmethod
+    def video_selectors() -> str:
+        # usually videos are from blob: but running the generic extractor should handle that
+        return "main video"
+
+    def open_page(self, url) -> bool:
+        if not self.sb.is_element_present(self.notifications_css_selector):
+            self._login()
+        if url != self.sb.get_current_url():
+            self.sb.open(url)
+        return True
+
+    @logger.catch
+    def _login(self) -> bool:
+        if self.sb.is_text_visible("Sign in to view more content"):
+            self.sb.click_link_text("Sign in", timeout=2)
+            self.sb.wait_for_ready_state_complete()
+        else:
+            self.sb.open("https://www.linkedin.com/login")
+            self.sb.wait_for_ready_state_complete()
+
+        username, password = self._get_username_password("linkedin.com")
+        logger.debug("LinkedinDropin Logging in to LinkedIn with username: {}", username)
+        self.sb.type("#username", username)
+        self.sb.type("#password", password)
+        self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
+        self.sb.click("button[type='submit']")
+        self.sb.wait_for_ready_state_complete()
+        # TODO: on suspicious login, LinkedIn may require an email verification code
+
+        if not self.sb.is_element_present(self.notifications_css_selector):
+            self.sb.click_if_visible('button[aria-label="Dismiss"]', timeout=0.5)
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
index 44d572b..3f699b6 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -1,4 +1,5 @@
 from contextlib import suppress
+from typing import Mapping
 
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -10,6 +11,20 @@ class RedditDropin(Dropin):
     A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
""" + def documentation() -> Mapping[str, str]: + return { + "name": "Reddit Dropin", + "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.", + "site": "reddit.com", + "authentication": { + "reddit.com": { + "username": "email address or username", + "password": "password", + } + }, + } + @staticmethod def suitable(url: str) -> bool: return "reddit.com" in url @@ -36,7 +50,7 @@ class RedditDropin(Dropin): self._close_cookies_banner() username, password = self._get_username_password("reddit.com") - logger.debug("RedditDropin Logging in to VK with username: {}", username) + logger.debug("RedditDropin Logging in to Reddit with username: {}", username) self.sb.type("#login-username", username) self.sb.type("#login-password", password) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 6888727..3f92eda 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -1,4 +1,5 @@ import re +from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -16,6 +17,19 @@ class VkDropin(Dropin): CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") + def documentation() -> Mapping[str, str]: + return { + "name": "VKontakte Dropin", + "description": "Handles VKontakte posts and works without authentication for some content.", + "site": "vk.com", + "authentication": { + "vk.com": { + "username": "phone number with country code", + "password": "password", + } + }, + } + @staticmethod def suitable(url: str) -> bool: return "vk.com" in url @@ -39,7 +53,7 @@ class VkDropin(Dropin): @logger.catch def _login(self) -> bool: - # TODO: test method + # TODO: test method, because current tests work without a login self.sb.open("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url(): diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 189a7e6..9006e57 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -7,8 +7,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor - -from .dropin import GenericDropin, InfoExtractor +from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor class Twitter(GenericDropin): diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 2bb19cf..79438bc 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urlparse, urlunparse +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse from ipaddress import ip_address @@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str: def clean(url: str) -> str: - return url + TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"} + + parsed = urlparse(url) + clean_qs = [(k, v) for k, v in parse_qsl(parsed.query) if k not in TRACKERS] + return parsed._replace(query=urlencode(clean_qs)).geturl() def 
is_auth_wall(url: str) -> bool: @@ -109,6 +113,8 @@ def is_relevant_url(url: str) -> bool: # reddit ("styles.redditmedia.com",), # opinionated but excludes may irrelevant images like avatars and banners ("emoji.redditmedia.com",), + # linkedin + ("static.licdn.com",), ] # TODO: make these globally configurable diff --git a/tests/.env.test.example b/tests/.env.test.example new file mode 100644 index 0000000..367d3f5 --- /dev/null +++ b/tests/.env.test.example @@ -0,0 +1,10 @@ +# ANTIBOT reddit test credentials +REDDIT_TEST_USERNAME="" +REDDIT_TEST_PASSWORD="" + +# ANTIBOT linkedin test credentials +LINKEDIN_TEST_USERNAME="" +LINKEDIN_TEST_PASSWORD="" + +# twitter test credentials +TWITTER_BEARER_TOKEN="TEST_KEY" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index ba7b48d..a54f01d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib +from loguru import logger import pytest from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.module import ModuleFactory @@ -20,6 +21,24 @@ from auto_archiver.core.module import ModuleFactory TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"] +def pytest_configure(): + # load environment variables from .env.test file. + env_path = os.path.join(os.path.dirname(__file__), ".env.test") + if os.path.exists(env_path): + with open(env_path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + if "=" in line: + key, value = line.split("=", 1) + os.environ[key.strip()] = value.strip().lstrip('"').rstrip('"') + else: + logger.warning( + f"Environment file {env_path} not found. Skipping loading environment variables, some tests may fail." + ) + + # don't check for ytdlp updates in tests @pytest.fixture(autouse=True) def skip_check_for_update(mocker): diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 06107b4..a8a51dd 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -1,3 +1,4 @@ +import os import pytest from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher @@ -34,7 +35,18 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "save_to_pdf": False, "max_download_images": 0, "max_download_videos": 0, + "user_data_dir": "./tests/tmp/user_data", "proxy": None, + "authentication": { + "reddit.com": { + "username": os.environ.get("REDDIT_TEST_USERNAME"), + "password": os.environ.get("REDDIT_TEST_PASSWORD"), + }, + "linkedin.com": { + "username": os.environ.get("LINKEDIN_TEST_USERNAME"), + "password": os.environ.get("LINKEDIN_TEST_PASSWORD"), + }, + }, } @pytest.mark.download @@ -82,10 +94,10 @@ class TestAntibotExtractorEnricher(TestExtractorBase): """ Test downloading pages with media. 
""" - self.extractor = setup_module( self.extractor_module, - { + self.config + | { "save_to_pdf": True, "max_download_images": 5, "max_download_videos": "inf", @@ -118,6 +130,50 @@ class TestAntibotExtractorEnricher(TestExtractorBase): f"Expected media with id '{expected_id}' not found" ) + @pytest.mark.skipif( + not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"), + reason="No Reddit test credentials provided", + ) + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_title,in_text,image_count,video_count", + [ + ( + "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/", + "Duy tran is the owner and prime wood work artist", + " Created Jan 26, 2015", + 4, + 0, + ), + ], + ) + def test_reddit_download_with_login( + self, setup_module, make_item, url, in_title, in_text, image_count, video_count + ): + self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count) + + @pytest.mark.skipif( + not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"), + reason="No LinkedIn test credentials provided", + ) + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_title,in_text,image_count,video_count", + [ + ( + "https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/", + "Post", + "It takes time to go from hunch to reporting...", + 2, + 0, + ), + ], + ) + def test_linkedin_download_with_login( + self, setup_module, make_item, url, in_title, in_text, image_count, video_count + ): + self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count) + @pytest.mark.download @pytest.mark.parametrize( "url,in_html", diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index df8e0f3..2c77122 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -1,5 +1,6 @@ import pytest from auto_archiver.utils.url import ( + clean, is_auth_wall, check_url_or_raise, domain_for_url, @@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality): ) def test_get_media_url_best_quality(input_url, expected_url): assert get_media_url_best_quality(input_url) == expected_url + + +@pytest.mark.parametrize( + "input_url,expected_url", + [ + # No trackers present + ("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"), + # Single tracker present + ("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"), + # Multiple trackers present + ("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"), + # Trackers mixed with other params + ( + "https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123", + "https://example.com/page?foo=bar&baz=qux", + ), + # Only trackers present + ("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"), + # No query string + ("https://example.com/page", "https://example.com/page"), + # Trackers in fragment (should not be removed) + ("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"), + # Trackers after fragment + ("https://example.com/page?utm_source=google#section-1", "https://example.com/page#section-1"), + # Trackers with empty value + ("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"), + # Trackers with multiple values + 
("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"), + # Trackers with encoded values + ("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"), + # Unrelated param with similar name + ("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"), + ], +) +def test_clean_removes_trackers(input_url, expected_url): + assert clean(input_url) == expected_url