From 1039e9631f0c13cca0f23bac4710b7bc6a7be1b7 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 11 Jun 2025 11:22:23 +0100
Subject: [PATCH 01/11] new reddit tests with .env.test

---
 .gitignore                                  |  1 +
 docs/source/development/testing.md          | 15 +++++++++++++--
 .../dropins/reddit.py                       |  2 +-
 tests/.env.test.example                     |  6 ++++++
 tests/conftest.py                           | 19 +++++++++++++++++++
 .../test_antibot_extractor_enricher.py      | 19 +++++++++++++++++--
 6 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 tests/.env.test.example

diff --git a/.gitignore b/.gitignore
index 35eee83..2c579fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 tmp*/
 temp/
 .env*
+!.env*.example
 .DS_Store
 expmt/
 service_account.json
diff --git a/docs/source/development/testing.md b/docs/source/development/testing.md
index 5de9574..290592c 100644
--- a/docs/source/development/testing.md
+++ b/docs/source/development/testing.md
@@ -3,14 +3,14 @@
 `pytest` is used for testing. There are two main types of tests:
 
 1. 'core' tests which should be run on every change
-2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
+2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed; they also take longer to run.
 
 ## Running Tests
 
 1. Make sure you've installed the dev dependencies with `poetry install --with dev`
 2. Tests can be run as follows:
 
-```
+```{code} bash
 #### Command prefix of 'poetry run' removed here for simplicity
 # run core tests
 pytest -ra -v -m "not download"
@@ -18,4 +18,15 @@ pytest -ra -v -m "not download"
 # run download tests
 pytest -ra -v -m "download"
 # run all tests
 pytest -ra -v
+
+
+# run a specific test file
+pytest -ra -v tests/test_file.py
+# run a specific test function
+pytest -ra -v tests/test_file.py::test_function_name
+```
+
+3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
+```{code} bash
+cp .env.test.example .env.test
+```
\ No newline at end of file
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
index 44d572b..78bc510 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -36,7 +36,7 @@ class RedditDropin(Dropin):
         self._close_cookies_banner()
 
         username, password = self._get_username_password("reddit.com")
-        logger.debug("RedditDropin Logging in to VK with username: {}", username)
+        logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
 
         self.sb.type("#login-username", username)
         self.sb.type("#login-password", password)
diff --git a/tests/.env.test.example b/tests/.env.test.example
new file mode 100644
index 0000000..2e058ab
--- /dev/null
+++ b/tests/.env.test.example
@@ -0,0 +1,6 @@
+# reddit test credentials
+REDDIT_TEST_USERNAME=""
+REDDIT_TEST_PASSWORD=""
+
+# twitter test credentials
+TWITTER_BEARER_TOKEN="TEST_KEY"
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index ba7b48d..a54f01d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
 from typing import Dict, Tuple
 import hashlib
 
+from loguru import logger
 import pytest
 
 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.core.module import ModuleFactory
@@ -20,6 +21,24 @@ from auto_archiver.core.module import ModuleFactory
 TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
 
 
+def pytest_configure():
+    # load environment variables from the .env.test file, skipping blank lines and comments
+    env_path = os.path.join(os.path.dirname(__file__), ".env.test")
+    if os.path.exists(env_path):
+        with open(env_path) as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                if "=" in line:
+                    key, value = line.split("=", 1)
+                    os.environ[key.strip()] = value.strip().lstrip('"').rstrip('"')
+    else:
+        logger.warning(
+            f"Environment file {env_path} not found. Skipping loading environment variables; some tests may fail."
+ ) + + # don't check for ytdlp updates in tests @pytest.fixture(autouse=True) def skip_check_for_update(mocker): diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 06107b4..c0044b9 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -1,3 +1,4 @@ +import os import pytest from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher @@ -34,7 +35,14 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "save_to_pdf": False, "max_download_images": 0, "max_download_videos": 0, + "user_data_dir": "./tests/tmp/user_data", "proxy": None, + "authentication": { + "reddit.com": { + "username": os.environ.get("REDDIT_TEST_USERNAME"), + "password": os.environ.get("REDDIT_TEST_PASSWORD"), + } + }, } @pytest.mark.download @@ -76,16 +84,23 @@ class TestAntibotExtractorEnricher(TestExtractorBase): 5, 0, ), + ( + "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/", + "Duy tran is the owner and prime wood work artist", + " Created Jan 26, 2015", + 4, + 0, + ), ], ) def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count): """ Test downloading pages with media. """ - self.extractor = setup_module( self.extractor_module, - { + self.config + | { "save_to_pdf": True, "max_download_images": 5, "max_download_videos": "inf", From 69ddb7214676113b3670e3afa3bd2b1d4b5d72df Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:27:11 +0100 Subject: [PATCH 02/11] separate reddit tests --- .../test_antibot_extractor_enricher.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index c0044b9..642d0ec 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -84,13 +84,6 @@ class TestAntibotExtractorEnricher(TestExtractorBase): 5, 0, ), - ( - "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/", - "Duy tran is the owner and prime wood work artist", - " Created Jan 26, 2015", - 4, - 0, - ), ], ) def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count): @@ -133,6 +126,28 @@ class TestAntibotExtractorEnricher(TestExtractorBase): f"Expected media with id '{expected_id}' not found" ) + @pytest.mark.skipif( + not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"), + reason="No Reddit test credentials provided", + ) + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_title,in_text,image_count,video_count", + [ + ( + "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/", + "Duy tran is the owner and prime wood work artist", + " Created Jan 26, 2015", + 4, + 0, + ), + ], + ) + def test_reddit_download_with_login( + self, setup_module, make_item, url, in_title, in_text, image_count, video_count + ): + self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count) + @pytest.mark.download @pytest.mark.parametrize( "url,in_html", From 3cf51dd8744d98848b31dee0e095d6167a61ca40 Mon Sep 17 00:00:00 2001 From: msramalho 
<19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:56:42 +0100 Subject: [PATCH 03/11] adds tracker remove feature and tests --- src/auto_archiver/core/orchestrator.py | 4 +-- src/auto_archiver/utils/url.py | 8 ++++-- tests/utils/test_urls.py | 37 ++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 0bff376..66073a7 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -34,7 +34,7 @@ from .config import ( from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from .consts import MODULE_TYPES, SetupError -from auto_archiver.utils.url import check_url_or_raise +from auto_archiver.utils.url import check_url_or_raise, clean if TYPE_CHECKING: from .base_module import BaseModule @@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ raise e # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs - url = original_url + url = clean(original_url) for a in self.extractors: url = a.sanitize_url(url) diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 2bb19cf..a44a91d 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urlparse, urlunparse +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse from ipaddress import ip_address @@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str: def clean(url: str) -> str: - return url + TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"} + + parsed = urlparse(url) + clean_qs = [(k, v) for k, v in parse_qsl(parsed.query) if k not in TRACKERS] + return parsed._replace(query=urlencode(clean_qs)).geturl() def is_auth_wall(url: str) -> bool: diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index df8e0f3..2c77122 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -1,5 +1,6 @@ import pytest from auto_archiver.utils.url import ( + clean, is_auth_wall, check_url_or_raise, domain_for_url, @@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality): ) def test_get_media_url_best_quality(input_url, expected_url): assert get_media_url_best_quality(input_url) == expected_url + + +@pytest.mark.parametrize( + "input_url,expected_url", + [ + # No trackers present + ("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"), + # Single tracker present + ("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"), + # Multiple trackers present + ("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"), + # Trackers mixed with other params + ( + "https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123", + "https://example.com/page?foo=bar&baz=qux", + ), + # Only trackers present + ("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"), + # No query string + ("https://example.com/page", "https://example.com/page"), + # Trackers in fragment (should not be removed) + ("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"), + # Trackers after fragment + ("https://example.com/page?utm_source=google#section-1", 
"https://example.com/page#section-1"), + # Trackers with empty value + ("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"), + # Trackers with multiple values + ("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"), + # Trackers with encoded values + ("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"), + # Unrelated param with similar name + ("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"), + ], +) +def test_clean_removes_trackers(input_url, expected_url): + assert clean(input_url) == expected_url From e567bba6f919069b7cfb26fe2b4398b01b05a4a0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 13:37:03 +0100 Subject: [PATCH 04/11] improves docs for how-to and migrations --- ..._how_to.md => 01_authentication_how_to.md} | 0 .../{gsheets_setup.md => 02_gsheets_setup.md} | 0 .../how_to/{logging.md => 03_logging.md} | 0 ..._server.md => 04_run_instagrapi_server.md} | 0 ...1_to_1_1_0.md => 05_upgrading_to_1_1_0.md} | 28 +++++++++++++------ ...nfig_format.md => 06_new_config_format.md} | 0 .../modules/generic_extractor/twitter.py | 3 +- 7 files changed, 20 insertions(+), 11 deletions(-) rename docs/source/how_to/{authentication_how_to.md => 01_authentication_how_to.md} (100%) rename docs/source/how_to/{gsheets_setup.md => 02_gsheets_setup.md} (100%) rename docs/source/how_to/{logging.md => 03_logging.md} (100%) rename docs/source/how_to/{run_instagrapi_server.md => 04_run_instagrapi_server.md} (100%) rename docs/source/how_to/{upgrading_1_0_1_to_1_1_0.md => 05_upgrading_to_1_1_0.md} (59%) rename docs/source/how_to/{new_config_format.md => 06_new_config_format.md} (100%) diff --git a/docs/source/how_to/authentication_how_to.md b/docs/source/how_to/01_authentication_how_to.md similarity index 100% rename from docs/source/how_to/authentication_how_to.md rename to docs/source/how_to/01_authentication_how_to.md diff --git a/docs/source/how_to/gsheets_setup.md b/docs/source/how_to/02_gsheets_setup.md similarity index 100% rename from docs/source/how_to/gsheets_setup.md rename to docs/source/how_to/02_gsheets_setup.md diff --git a/docs/source/how_to/logging.md b/docs/source/how_to/03_logging.md similarity index 100% rename from docs/source/how_to/logging.md rename to docs/source/how_to/03_logging.md diff --git a/docs/source/how_to/run_instagrapi_server.md b/docs/source/how_to/04_run_instagrapi_server.md similarity index 100% rename from docs/source/how_to/run_instagrapi_server.md rename to docs/source/how_to/04_run_instagrapi_server.md diff --git a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md b/docs/source/how_to/05_upgrading_to_1_1_0.md similarity index 59% rename from docs/source/how_to/upgrading_1_0_1_to_1_1_0.md rename to docs/source/how_to/05_upgrading_to_1_1_0.md index 81e00e2..57bc253 100644 --- a/docs/source/how_to/upgrading_1_0_1_to_1_1_0.md +++ b/docs/source/how_to/05_upgrading_to_1_1_0.md @@ -15,19 +15,29 @@ We have dropped the `vk_extractor` because of problems in a project we relied on Module 'vk_extractor' not found. Are you sure it's installed/exists? ``` +## Dropping `screenshot_enricher` module +We have dropped the `screenshot_enricher` module because a new `antibot_extractor_enricher` (see below) module replaces its functionality more robustly and with less dependency hassle on geckodriver/firefox. 
You will need to remove it from your configuration file; otherwise you will see an error like:
+
+```{code} console
+Module 'screenshot_enricher' not found. Are you sure it's installed/exists?
+```
+
+
 ## New `antibot_extractor_enricher` module and VkDropin
-We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
+We have added a new [`antibot_extractor_enricher`](../modules/autogen/extractor/antibot_extractor_enricher.md) module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
 
 ```{code} yaml
 steps:
-    extractors:
-    - antibot_extractor_enricher
+  extractors:
+  - antibot_extractor_enricher
 
-    # or alternatively, if you want to use it as an enricher:
-    enrichers:
-    - antibot_extractor_enricher
+  # or alternatively, if you want to use it as an enricher:
+  enrichers:
+  - antibot_extractor_enricher
 ```
 
+It will take a full-page screenshot, capture a PDF, extract the HTML source code, and download any other relevant media.
+
 It comes with Dropins that we will be adding and maintaining.
 
 > Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add them to your configuration steps for them to run. Sometimes they need `authentication` configurations though.
 
@@ -36,9 +46,9 @@ One such Dropin is the VkDropin which uses this automated browser to access VKon
 
 ```{code} yaml
 authentication:
-  vk:
-    username: your_username
-    password: your_password
+  vk.com:
+    username: your_username
+    password: your_password
 ```
 
 See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similarly to the VkDropin.
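+
+For example, a configuration that authenticates several Dropins at once could look like this (a sketch: the `reddit.com` and `linkedin.com` entries follow the Reddit and LinkedIn Dropins in this repository, and all values are placeholders for your own credentials):
+
+```{code} yaml
+authentication:
+  vk.com:
+    username: your_username
+    password: your_password
+  reddit.com:
+    username: your_username_or_email
+    password: your_password
+  linkedin.com:
+    username: your_email_or_phone
+    password: your_password
+```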
\ No newline at end of file diff --git a/docs/source/how_to/new_config_format.md b/docs/source/how_to/06_new_config_format.md similarity index 100% rename from docs/source/how_to/new_config_format.md rename to docs/source/how_to/06_new_config_format.md diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 189a7e6..9006e57 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -7,8 +7,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor - -from .dropin import GenericDropin, InfoExtractor +from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor class Twitter(GenericDropin): From d60d02c16ebfded91c9b6d88652de77c1227b1d1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:50:31 +0100 Subject: [PATCH 05/11] improves download_from_url --- src/auto_archiver/core/extractor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index ca3359d..5dca928 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -77,6 +77,8 @@ class Extractor(BaseModule): downloads a URL to provided filename, or inferred from URL, returns local filename Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful. """ + if any(url.startswith(x) for x in ["blob:", "data:"]): + return None, url if try_best_quality else None if try_best_quality: with suppress(Exception): @@ -116,6 +118,8 @@ class Extractor(BaseModule): except requests.RequestException as e: logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}") + if try_best_quality: + return None, url @abstractmethod def download(self, item: Metadata) -> Metadata | False: From b60469767ace97d11c7b5c940779bbd57ecbeb8b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:51:22 +0100 Subject: [PATCH 06/11] more flexibility to antibot dropins media finding process --- .../antibot_extractor_enricher.py | 13 +++++------ .../antibot_extractor_enricher/dropin.py | 22 ++++++++++++++++++- .../dropins/default.py | 4 ---- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 1982389..04e4702 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher): self._enrich_download_media( sb, to_enrich, - css_selector=dropin.images_selectors(), + js_css_selector=dropin.js_for_image_css_selectors(), max_media=self.max_download_images - downloaded_images, ) self._enrich_download_media( sb, to_enrich, - css_selector=dropin.video_selectors(), + js_css_selector=dropin.js_for_video_css_selectors(), max_media=self.max_download_videos - downloaded_videos, ) logger.info(f"ANTIBOT completed for {url_sample}") @@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): 
         to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
 
     @logger.catch
-    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
+    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
         """
         Downloads media from the page and adds them to the Metadata object.
         This method is called by the enrich method.
@@ -276,11 +276,8 @@
         url = to_enrich.get_url()
         all_urls = set()
 
-        sources = sb.execute_script(f"""
-            return Array.from(document.querySelectorAll("{css_selector}"))
-                .map(el => el.src || el.href)
-                .filter(Boolean);
-        """)
+        sources = sb.execute_script(js_css_selector)
+        # the JS snippet is provided by the dropin, see js_for_image_css_selectors and js_for_video_css_selectors
         for src in sources:
             if len(all_urls) >= max_media:
                 logger.debug(f"Reached max download limit of {max_media} images/videos.")
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 15c2e28..2e8c4f6 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -53,6 +53,26 @@ class Dropin:
         """
         return "video, source"
 
+    def js_for_image_css_selectors(self) -> str:
+        """
+        A configurable JS script that takes the css selectors from the dropin itself and returns an array of image source URLs (src or href) according to the selection.
+
+        You can overwrite this instead of `images_selectors` for more control over scraped images.
+        """
+        return f"""
+            return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
+    def js_for_video_css_selectors(self) -> str:
+        """
+        A configurable JS script that takes the css selectors from the dropin itself and returns an array of video source URLs (src or href) according to the selection.
+
+        You can overwrite this instead of `video_selectors` for more control over scraped videos.
+        """
+        return f"""
+            return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
     def open_page(self, url) -> bool:
         """
         Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -66,7 +86,7 @@
         Extract image and/or video data from the currently open post with SeleniumBase.
 
         Media is added to the `to_enrich` Metadata object.
 
         :return: A tuple (number of Images added, number of Videos added).
""" - raise NotImplementedError("This method should be implemented in the subclass") + return 0, 0 def _get_username_password(self, site) -> tuple[str, str]: """ diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py index c5c865a..72ec3f0 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py @@ -1,4 +1,3 @@ -from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -13,6 +12,3 @@ class DefaultDropin(Dropin): def open_page(self, url) -> bool: return True - - def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: - return 0, 0 From cd19181d8f6e8f69f09e4990c9a7a9f871ee3785 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:51:42 +0100 Subject: [PATCH 07/11] minor improvements --- .gitignore | 3 ++- src/auto_archiver/utils/url.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2c579fa..d14e3bb 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,5 @@ docs/source/modules/autogen/ scripts/settings_page.html scripts/settings/src/schema.json .vite -downloaded_files \ No newline at end of file +downloaded_files +latest_logs \ No newline at end of file diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index a44a91d..79438bc 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -113,6 +113,8 @@ def is_relevant_url(url: str) -> bool: # reddit ("styles.redditmedia.com",), # opinionated but excludes may irrelevant images like avatars and banners ("emoji.redditmedia.com",), + # linkedin + ("static.licdn.com",), ] # TODO: make these globally configurable From 2adcf231f79b98882d5365e9c4eb6cdd7f0af2ea Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:51:52 +0100 Subject: [PATCH 08/11] new LinkedIn Dropin for Antibot --- .../dropins/linkedin.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py new file mode 100644 index 0000000..3917af9 --- /dev/null +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py @@ -0,0 +1,59 @@ +from loguru import logger +from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + + +class LinkedinDropin(Dropin): + """ + A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module. 
+ """ + + notifications_css_selector = 'a[href*="linkedin.com/notifications"]' + + @staticmethod + def suitable(url: str) -> bool: + return "linkedin.com" in url + + def js_for_image_css_selectors(self) -> str: + get_all_css = "main img:not([src*='profile-displayphoto']):not([src*='profile-framedphoto'])" + get_first_css = ( + "main img[src*='profile-framedphoto'], main img[src*='profile-displayphoto'], main img[src*='company-logo']" + ) + + return f""" + const all = Array.from(document.querySelectorAll("{get_all_css}")).map(el => el.src || el.href).filter(Boolean); + const profile = document.querySelector("{get_first_css}"); + return all.concat(profile?.src || profile?.href || []).filter(Boolean); + """ + + @staticmethod + def video_selectors() -> str: + # usually videos are from blob: but running the generic extractor should handle that + return "main video" + + def open_page(self, url) -> bool: + if not self.sb.is_element_present(self.notifications_css_selector): + self._login() + if url != self.sb.get_current_url(): + self.sb.open(url) + return True + + @logger.catch + def _login(self) -> bool: + if self.sb.is_text_visible("Sign in to view more content"): + self.sb.click_link_text("Sign in", timeout=2) + self.sb.wait_for_ready_state_complete() + else: + self.sb.open("https://www.linkedin.com/login") + self.sb.wait_for_ready_state_complete() + + username, password = self._get_username_password("linkedin.com") + logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username) + self.sb.type("#username", username) + self.sb.type("#password", password) + self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5) + self.sb.click("button[type='submit']") + self.sb.wait_for_ready_state_complete() + # TODO: on suspicious login, LinkedIn may require an email verification code + + if not self.sb.is_element_present(self.notifications_css_selector): + self.sb.click_if_visible('button[aria-label="Dismiss"]', timeout=0.5) From f5be7a50c15240f4549bb599165e98d1cf548780 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:52:03 +0100 Subject: [PATCH 09/11] Testing Linkedin Dropin for Antibot --- tests/.env.test.example | 6 +++- .../test_antibot_extractor_enricher.py | 28 ++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tests/.env.test.example b/tests/.env.test.example index 2e058ab..367d3f5 100644 --- a/tests/.env.test.example +++ b/tests/.env.test.example @@ -1,6 +1,10 @@ -# reddit test credentials +# ANTIBOT reddit test credentials REDDIT_TEST_USERNAME="" REDDIT_TEST_PASSWORD="" +# ANTIBOT linkedin test credentials +LINKEDIN_TEST_USERNAME="" +LINKEDIN_TEST_PASSWORD="" + # twitter test credentials TWITTER_BEARER_TOKEN="TEST_KEY" \ No newline at end of file diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 642d0ec..516357e 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -41,7 +41,11 @@ class TestAntibotExtractorEnricher(TestExtractorBase): "reddit.com": { "username": os.environ.get("REDDIT_TEST_USERNAME"), "password": os.environ.get("REDDIT_TEST_PASSWORD"), - } + }, + "linkedin.com": { + "username": os.environ.get("LINKEDIN_TEST_USERNAME"), + "password": os.environ.get("LINKEDIN_TEST_PASSWORD"), + }, }, } @@ -148,6 +152,28 @@ class TestAntibotExtractorEnricher(TestExtractorBase): ): self.test_download_pages_with_media(setup_module, 
make_item, url, in_title, in_text, image_count, video_count) + @pytest.mark.skipif( + not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"), + reason="No Reddit test credentials provided", + ) + @pytest.mark.download + @pytest.mark.parametrize( + "url,in_title,in_text,image_count,video_count", + [ + ( + "https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/", + "Post", + "It takes time to go from hunch to reporting...", + 2, + 0, + ), + ], + ) + def test_linkedin_download_with_login( + self, setup_module, make_item, url, in_title, in_text, image_count, video_count + ): + self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count) + @pytest.mark.download @pytest.mark.parametrize( "url,in_html", From aaa9ead39d267ae07ce4bb9e1c3f4e1479e72805 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:58:53 +0100 Subject: [PATCH 10/11] adds documentation for dropins --- docs/scripts/scripts.py | 22 ++++++++++++++++++- .../__manifest__.py | 9 ++++++-- .../antibot_extractor_enricher/dropin.py | 14 ++++++++++++ .../dropins/linkedin.py | 15 +++++++++++++ .../dropins/reddit.py | 14 ++++++++++++ .../antibot_extractor_enricher/dropins/vk.py | 16 +++++++++++++- 6 files changed, 86 insertions(+), 4 deletions(-) diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index bfddd29..f9cb13d 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -47,7 +47,6 @@ def generate_module_docs(): for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)): # generate the markdown file from the __manifest__.py file. - manifest = module.manifest for type in manifest["type"]: modules_by_type.setdefault(type, []).append(module) @@ -64,6 +63,27 @@ def generate_module_docs(): """ steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"]) + if manifest.get("autodoc_dropins"): + loaded_module = module.load({}) + dropins = loaded_module.load_dropins() + dropin_str = "\n##### Available Dropins\n" + for dropin in dropins: + if not (ddoc := dropin.documentation()): + continue + dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n" + dropin_str += f"{ddoc.get('description')}\n\n" + if ddoc.get("site"): + dropin_str += f"**Site**: {ddoc['site']}\n\n" + if dauth := ddoc.get("authentication"): + dropin_str += "**YAML configuration**:\n" + dropin_auth_yaml = "authentication:\n...\n" + for site, creds in dauth.items(): + dropin_auth_yaml += f" {site}:\n" + for k, v in creds.items(): + dropin_auth_yaml += f' {k}: "{v}"\n' + dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n" + readme_str += dropin_str + if not manifest["configs"]: config_string = f"# No configuration options for {module.name}.*\n" else: diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index e2bcad9..f08547b 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -31,11 +31,12 @@ "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'", }, }, + "autodoc_dropins": True, "description": """ Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile 
or Google Recaptcha.
-    Still in trial development, please report any issues or suggestions via GitHub Issues.
-
+    > ⚠️ Still in trial development; please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
+
     ### Features
     - Extracts the HTML source code of the page.
     - Takes full-page screenshots of web pages.
@@ -44,5 +45,9 @@
     ### Notes
     - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
+
+    ### Dropins
+    This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom login flows. You don't need to include the Dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites; see the detailed instructions for each Dropin below.
+
     """,
 }
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 2e8c4f6..d4b255d 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,4 +1,5 @@
 import os
+from typing import Mapping
 from loguru import logger
 from seleniumbase import SB
 import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
     This class is designed to be a base class for drop-ins that can handle specific websites.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Each Dropin should auto-document itself with this method.
+        The returned dictionary can include:
+        - 'name': A string representing the name of the dropin.
+        - 'description': A string describing the functionality of the dropin.
+        - 'site': A string representing the site this dropin is for.
+        - 'authentication': A dictionary with an example authentication configuration for the site.
+
+        """
+        return {}
+
     def __init__(self, sb: SB, extractor: Extractor):
         """
         Initialize the Dropin with the given SeleniumBase instance.
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
index 3917af9..336b630 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
@@ -1,3 +1,4 @@
+from typing import Mapping
 from loguru import logger
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
 
@@ -7,6 +8,20 @@ class LinkedinDropin(Dropin):
     A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "Linkedin Dropin",
+            "description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it.
The first time you log in from a new IP, LinkedIn may require an email verification code; you can do a manual login first and then it won't ask for it again.",
+            "site": "linkedin.com",
+            "authentication": {
+                "linkedin.com": {
+                    "username": "email address or phone number",
+                    "password": "password",
+                }
+            },
+        }
+
     notifications_css_selector = 'a[href*="linkedin.com/notifications"]'
 
     @staticmethod
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
index 78bc510..3f699b6 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -1,4 +1,5 @@
 from contextlib import suppress
+from typing import Mapping
 
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -10,6 +11,20 @@ class RedditDropin(Dropin):
     A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "Reddit Dropin",
+            "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
+            "site": "reddit.com",
+            "authentication": {
+                "reddit.com": {
+                    "username": "email address or username",
+                    "password": "password",
+                }
+            },
+        }
+
     @staticmethod
     def suitable(url: str) -> bool:
         return "reddit.com" in url
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
index 6888727..3f92eda 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -1,4 +1,5 @@
 import re
+from typing import Mapping
 
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -16,6 +17,20 @@ class VkDropin(Dropin):
     CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
     PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "VKontakte Dropin",
+            "description": "Handles VKontakte posts and works without authentication for some content.",
+            "site": "vk.com",
+            "authentication": {
+                "vk.com": {
+                    "username": "phone number with country code",
+                    "password": "password",
+                }
+            },
+        }
+
     @staticmethod
     def suitable(url: str) -> bool:
         return "vk.com" in url
@@ -39,7 +54,7 @@ class VkDropin(Dropin):
 
     @logger.catch
     def _login(self) -> bool:
-        # TODO: test method
+        # TODO: test method, because current tests work without a login
        self.sb.open("https://vk.com")
         self.sb.wait_for_ready_state_complete()
         if "/feed" in self.sb.get_current_url():
From d7a48e465be2d2ea4bb2b91a091d1af49d98a4b9 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 11 Jun 2025 18:04:49 +0100
Subject: [PATCH 11/11] fix copypasta

---
 tests/extractors/test_antibot_extractor_enricher.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py
index 516357e..a8a51dd 100644
--- a/tests/extractors/test_antibot_extractor_enricher.py
+++ b/tests/extractors/test_antibot_extractor_enricher.py
@@ -153,8 +153,8 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
         self.test_download_pages_with_media(setup_module,
make_item, url, in_title, in_text, image_count, video_count) @pytest.mark.skipif( - not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"), - reason="No Reddit test credentials provided", + not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"), + reason="No LinkedIn test credentials provided", ) @pytest.mark.download @pytest.mark.parametrize(