Mirror of https://github.com/bellingcat/auto-archiver
Merge pull request #319 from bellingcat/feat/linkedin-antibot
Antibot Dropin for Linkedin
commit 3d31c7605b
@@ -1,6 +1,7 @@
tmp*/
temp/
.env*
!.env*.example
.DS_Store
expmt/
service_account.json
@@ -37,4 +38,5 @@ docs/source/modules/autogen/
scripts/settings_page.html
scripts/settings/src/schema.json
.vite
downloaded_files
+latest_logs
@@ -47,7 +47,6 @@ def generate_module_docs():

    for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
        # generate the markdown file from the __manifest__.py file.

        manifest = module.manifest
        for type in manifest["type"]:
            modules_by_type.setdefault(type, []).append(module)

@@ -64,6 +63,27 @@ def generate_module_docs():
        """
        steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest["type"])

+        if manifest.get("autodoc_dropins"):
+            loaded_module = module.load({})
+            dropins = loaded_module.load_dropins()
+            dropin_str = "\n##### Available Dropins\n"
+            for dropin in dropins:
+                if not (ddoc := dropin.documentation()):
+                    continue
+                dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
+                dropin_str += f"{ddoc.get('description')}\n\n"
+                if ddoc.get("site"):
+                    dropin_str += f"**Site**: {ddoc['site']}\n\n"
+                if dauth := ddoc.get("authentication"):
+                    dropin_str += "**YAML configuration**:\n"
+                    dropin_auth_yaml = "authentication:\n...\n"
+                    for site, creds in dauth.items():
+                        dropin_auth_yaml += f"  {site}:\n"
+                        for k, v in creds.items():
+                            dropin_auth_yaml += f'    {k}: "{v}"\n'
+                    dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
+            readme_str += dropin_str
+
        if not manifest["configs"]:
            config_string = f"# No configuration options for {module.name}.*\n"
        else:
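For reference, this is roughly the snippet the loop above appends to a module's generated docs. The example below is a sketch assembled from the `documentation()` dict of the LinkedinDropin added later in this PR, not verbatim generator output:

````{code} markdown
##### Available Dropins

###### Linkedin Dropin

Handles LinkedIn pages/posts; requires authentication to access most content but is still useful without it.

**Site**: linkedin.com

**YAML configuration**:
```{code} yaml
authentication:
...
  linkedin.com:
    username: "email address or phone number"
    password: "password"
...
```
````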
@@ -3,14 +3,14 @@
`pytest` is used for testing. There are two main types of tests:

1. 'core' tests which should be run on every change
-2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
+2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed; they take longer.


## Running Tests

1. Make sure you've installed the dev dependencies with `poetry install --with dev`
2. Tests can be run as follows:
-```
+```{code} bash
#### Command prefix of 'poetry run' removed here for simplicity
# run core tests
pytest -ra -v -m "not download"

@@ -18,4 +18,15 @@ pytest -ra -v -m "not download"
pytest -ra -v -m "download"
# run all tests
pytest -ra -v
+
+# run a specific test file
+pytest -ra -v tests/test_file.py
+# run a specific test function
+pytest -ra -v tests/test_file.py::test_function_name
```

+3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
+```{code} bash
+cp .env.test.example .env.test
+```
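To illustrate the split between the two test types, here is a minimal sketch of how a network-hitting test opts into the `download` marker (the test name and URL are illustrative, and the marker is assumed to be registered in the project's pytest configuration):

```{code} python
import urllib.request

import pytest


@pytest.mark.download
def test_example_site_is_up():
    # hits the network, so it is excluded by: pytest -m "not download"
    with urllib.request.urlopen("https://example.com", timeout=10) as resp:
        assert resp.status == 200
```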
@@ -15,19 +15,29 @@ We have dropped the `vk_extractor` because of problems in a project we relied on
Module 'vk_extractor' not found. Are you sure it's installed/exists?
```

## Dropping `screenshot_enricher` module
We have dropped the `screenshot_enricher` module because the new `antibot_extractor_enricher` module (see below) replaces its functionality more robustly and with less dependency hassle around geckodriver/Firefox. You will need to remove it from your configuration file, otherwise you will see an error like:

```{code} console
Module 'screenshot_enricher' not found. Are you sure it's installed/exists?
```


## New `antibot_extractor_enricher` module and VkDropin
-We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
+We have added a new [`antibot_extractor_enricher`](../modules/autogen/extractor/antibot_extractor_enricher.md) module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:

```{code} yaml
steps:
  extractors:
  - antibot_extractor_enricher

# or alternatively, if you want to use it as an enricher:
  enrichers:
  - antibot_extractor_enricher
```

It takes a full-page screenshot and a PDF capture, extracts the HTML source code, and grabs any other relevant media.

It comes with Dropins that we will keep adding to and maintaining.

> Dropin: a module with site-specific behaviours that is loaded automatically. You don't need to add Dropins to your configuration steps for them to run, though some need `authentication` configuration.

@@ -36,9 +46,9 @@ One such Dropin is the VkDropin which uses this automated browser to access VKontakte

```{code} yaml
authentication:
-  vk:
-    username: your_username
-    password: your_password
+  vk.com:
+    username: your_username
+    password: your_password
```

See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, similar to the VkDropin; a combined sketch follows below.
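A combined `authentication` sketch covering the Dropins touched in this PR (the site keys and credential hints come from each Dropin's `documentation()` below; the values are placeholders):

```{code} yaml
authentication:
  vk.com:
    username: "phone number with country code"
    password: "password"
  reddit.com:
    username: "email address or username"
    password: "password"
  linkedin.com:
    username: "email address or phone number"
    password: "password"
```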
@@ -77,6 +77,8 @@ class Extractor(BaseModule):
        downloads a URL to provided filename, or inferred from URL, returns local filename
+        Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
        """
        if any(url.startswith(x) for x in ["blob:", "data:"]):
            return (None, url) if try_best_quality else None

        if try_best_quality:
            with suppress(Exception):

@@ -116,6 +118,8 @@ class Extractor(BaseModule):

        except requests.RequestException as e:
            logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
+            if try_best_quality:
+                return None, url

    @abstractmethod
    def download(self, item: Metadata) -> Metadata | False:
@@ -34,7 +34,7 @@ from .config import (
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES, SetupError
-from auto_archiver.utils.url import check_url_or_raise
+from auto_archiver.utils.url import check_url_or_raise, clean

if TYPE_CHECKING:
    from .base_module import BaseModule

@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            raise e

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
-        url = original_url
+        url = clean(original_url)
        for a in self.extractors:
            url = a.sanitize_url(url)
@@ -31,11 +31,12 @@
            "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
        },
    },
+    "autodoc_dropins": True,
    "description": """
    Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.

-    Still in trial development, please report any issues or suggestions via GitHub Issues.
+    > ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).

    ### Features
    - Extracts the HTML source code of the page.
    - Takes full-page screenshots of web pages.

@@ -44,5 +45,9 @@

    ### Notes
    - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.

+    ### Dropins
+    This module uses sub-modules called Dropins for specific sites, which let it handle anti-bot measures and custom login flows. You don't need to include the Dropins in your configuration, but you do need to add authentication credentials if you want to get past login walls on those sites; see the detailed instructions for each Dropin below.
+
    """,
}
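For orientation, a sketch of how this module's options could look in an orchestration file, based on the `proxy` help text above (the server value is a placeholder, and the top-level module key assumes auto-archiver's convention of configuring modules by name):

```{code} yaml
steps:
  extractors:
  - antibot_extractor_enricher

antibot_extractor_enricher:
  # optional; format 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'
  proxy: "user:pass@proxy.example.com:8080"
```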
@@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
            self._enrich_download_media(
                sb,
                to_enrich,
-                css_selector=dropin.images_selectors(),
+                js_css_selector=dropin.js_for_image_css_selectors(),
                max_media=self.max_download_images - downloaded_images,
            )
            self._enrich_download_media(
                sb,
                to_enrich,
-                css_selector=dropin.video_selectors(),
+                js_css_selector=dropin.js_for_video_css_selectors(),
                max_media=self.max_download_videos - downloaded_videos,
            )
            logger.info(f"ANTIBOT completed for {url_sample}")

@@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
            to_enrich.add_media(Media(filename=pdf_filename), id="pdf")

    @logger.catch
-    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
+    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
        """
        Downloads media from the page and adds them to the Metadata object.
        This method is called by the enrich method.

@@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
        url = to_enrich.get_url()
        all_urls = set()

-        sources = sb.execute_script(f"""
-            return Array.from(document.querySelectorAll("{css_selector}"))
-            .map(el => el.src || el.href)
-            .filter(Boolean);
-        """)
+        sources = sb.execute_script(js_css_selector)
+        # the script comes from the dropin's js_for_*_css_selectors methods
        for src in sources:
            if len(all_urls) >= max_media:
                logger.debug(f"Reached max download limit of {max_media} images/videos.")
@@ -1,4 +1,5 @@
import os
+from typing import Mapping
from loguru import logger
from seleniumbase import SB
import yt_dlp

@@ -13,6 +14,19 @@ class Dropin:
    This class is designed to be a base class for drop-ins that can handle specific websites.
    """

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Each Dropin should auto-document itself with this method.
+        The returned dictionary can include:
+        - 'name': a string with the name of the dropin.
+        - 'description': a string describing the functionality of the dropin.
+        - 'site': a string with the site this dropin is for.
+        - 'authentication': a dictionary with an authentication example for the site.
+        """
+        return {}
+
    def __init__(self, sb: SB, extractor: Extractor):
        """
        Initialize the Dropin with the given SeleniumBase instance.

@@ -53,6 +67,26 @@ class Dropin:
        """
        return "video, source"

+    def js_for_image_css_selectors(self) -> str:
+        """
+        A configurable JS snippet that takes the CSS selector from the dropin itself and returns an array of image source URLs matching the selection.
+
+        You can override this instead of `images_selectors` for more control over scraped images.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
+    def js_for_video_css_selectors(self) -> str:
+        """
+        A configurable JS snippet that takes the CSS selector from the dropin itself and returns an array of video source URLs matching the selection.
+
+        You can override this instead of `video_selectors` for more control over scraped videos.
+        """
+        return f"""
+        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
    def open_page(self, url) -> bool:
        """
        Make sure the page is opened, even if it requires authentication, captcha solving, etc.

@@ -66,7 +100,7 @@ class Dropin:
        Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
        :return: A tuple (number of Images added, number of Videos added).
        """
-        raise NotImplementedError("This method should be implemented in the subclass")
+        return 0, 0

    def _get_username_password(self, site) -> tuple[str, str]:
        """
@@ -1,4 +1,3 @@
-from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


@@ -13,6 +12,3 @@ class DefaultDropin(Dropin):

    def open_page(self, url) -> bool:
        return True
-
-    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        return 0, 0
@@ -0,0 +1,74 @@
from typing import Mapping

from loguru import logger

from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


class LinkedinDropin(Dropin):
    """
    A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
    """

    @staticmethod
    def documentation() -> Mapping[str, str]:
        return {
            "name": "Linkedin Dropin",
            "description": "Handles LinkedIn pages/posts; requires authentication to access most content but is still useful without it. The first time you log in from a new IP, LinkedIn may require an email verification code; you can do a manual login first and then it won't ask for it again.",
            "site": "linkedin.com",
            "authentication": {
                "linkedin.com": {
                    "username": "email address or phone number",
                    "password": "password",
                }
            },
        }

    notifications_css_selector = 'a[href*="linkedin.com/notifications"]'

    @staticmethod
    def suitable(url: str) -> bool:
        return "linkedin.com" in url

    def js_for_image_css_selectors(self) -> str:
        get_all_css = "main img:not([src*='profile-displayphoto']):not([src*='profile-framedphoto'])"
        get_first_css = (
            "main img[src*='profile-framedphoto'], main img[src*='profile-displayphoto'], main img[src*='company-logo']"
        )

        return f"""
        const all = Array.from(document.querySelectorAll("{get_all_css}")).map(el => el.src || el.href).filter(Boolean);
        const profile = document.querySelector("{get_first_css}");
        return all.concat(profile?.src || profile?.href || []).filter(Boolean);
        """

    @staticmethod
    def video_selectors() -> str:
        # usually videos are from blob: but running the generic extractor should handle that
        return "main video"

    def open_page(self, url) -> bool:
        if not self.sb.is_element_present(self.notifications_css_selector):
            self._login()
        if url != self.sb.get_current_url():
            self.sb.open(url)
        return True

    @logger.catch
    def _login(self) -> bool:
        if self.sb.is_text_visible("Sign in to view more content"):
            self.sb.click_link_text("Sign in", timeout=2)
            self.sb.wait_for_ready_state_complete()
        else:
            self.sb.open("https://www.linkedin.com/login")
            self.sb.wait_for_ready_state_complete()

        username, password = self._get_username_password("linkedin.com")
        logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
        self.sb.type("#username", username)
        self.sb.type("#password", password)
        self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
        self.sb.click("button[type='submit']")
        self.sb.wait_for_ready_state_complete()
        # TODO: on suspicious login, LinkedIn may require an email verification code

        if not self.sb.is_element_present(self.notifications_css_selector):
            self.sb.click_if_visible('button[aria-label="Dismiss"]', timeout=0.5)
@@ -1,4 +1,5 @@
from contextlib import suppress
+from typing import Mapping

from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -10,6 +11,19 @@ class RedditDropin(Dropin):
    A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
    """

+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "Reddit Dropin",
+            "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
+            "site": "reddit.com",
+            "authentication": {
+                "reddit.com": {
+                    "username": "email address or username",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "reddit.com" in url

@@ -36,7 +50,7 @@ class RedditDropin(Dropin):
            self._close_cookies_banner()

        username, password = self._get_username_password("reddit.com")
-        logger.debug("RedditDropin Logging in to VK with username: {}", username)
+        logger.debug("RedditDropin Logging in to Reddit with username: {}", username)

        self.sb.type("#login-username", username)
        self.sb.type("#login-password", password)
@@ -1,4 +1,5 @@
import re
+from typing import Mapping

from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -16,6 +17,19 @@ class VkDropin(Dropin):
    CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")

+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "VKontakte Dropin",
+            "description": "Handles VKontakte posts and works without authentication for some content.",
+            "site": "vk.com",
+            "authentication": {
+                "vk.com": {
+                    "username": "phone number with country code",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "vk.com" in url

@@ -39,7 +53,7 @@ class VkDropin(Dropin):

    @logger.catch
    def _login(self) -> bool:
-        # TODO: test method
+        # TODO: test method, because current tests work without a login
        self.sb.open("https://vk.com")
        self.sb.wait_for_ready_state_complete()
        if "/feed" in self.sb.get_current_url():
@@ -7,8 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor

-from .dropin import GenericDropin, InfoExtractor
+from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor


class Twitter(GenericDropin):
@@ -1,5 +1,5 @@
import re
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from ipaddress import ip_address


@@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str:


def clean(url: str) -> str:
-    return url
+    TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}
+
+    parsed = urlparse(url)
+    clean_qs = [(k, v) for k, v in parse_qsl(parsed.query) if k not in TRACKERS]
+    return parsed._replace(query=urlencode(clean_qs)).geturl()


def is_auth_wall(url: str) -> bool:

@@ -109,6 +113,8 @@ def is_relevant_url(url: str) -> bool:
        # reddit
        ("styles.redditmedia.com",),  # opinionated but excludes many irrelevant images like avatars and banners
        ("emoji.redditmedia.com",),
+        # linkedin
+        ("static.licdn.com",),
    ]

    # TODO: make these globally configurable
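To make the behavior of the new `clean()` concrete, a small usage sketch (the expected outputs mirror the test cases added below):

```{code} python
from auto_archiver.utils.url import clean

# tracking parameters are stripped; other query params and fragments survive
assert clean("https://example.com/page?utm_source=google&foo=bar") == "https://example.com/page?foo=bar"
assert clean("https://example.com/page?utm_source=google#section-1") == "https://example.com/page#section-1"
```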
@@ -0,0 +1,10 @@
# ANTIBOT reddit test credentials
REDDIT_TEST_USERNAME=""
REDDIT_TEST_PASSWORD=""

# ANTIBOT linkedin test credentials
LINKEDIN_TEST_USERNAME=""
LINKEDIN_TEST_PASSWORD=""

# twitter test credentials
TWITTER_BEARER_TOKEN="TEST_KEY"
@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib

from loguru import logger
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory

@@ -20,6 +21,24 @@ from auto_archiver.core.module import ModuleFactory
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]


+def pytest_configure():
+    # load environment variables from the .env.test file
+    env_path = os.path.join(os.path.dirname(__file__), ".env.test")
+    if os.path.exists(env_path):
+        with open(env_path) as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                if "=" in line:
+                    key, value = line.split("=", 1)
+                    os.environ[key.strip()] = value.strip().lstrip('"').rstrip('"')
+    else:
+        logger.warning(
+            f"Environment file {env_path} not found. Skipping loading environment variables, some tests may fail."
+        )
+
+
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
@@ -1,3 +1,4 @@
+import os
import pytest

from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher

@@ -34,7 +35,18 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
        "save_to_pdf": False,
        "max_download_images": 0,
        "max_download_videos": 0,
        "user_data_dir": "./tests/tmp/user_data",
        "proxy": None,
+        "authentication": {
+            "reddit.com": {
+                "username": os.environ.get("REDDIT_TEST_USERNAME"),
+                "password": os.environ.get("REDDIT_TEST_PASSWORD"),
+            },
+            "linkedin.com": {
+                "username": os.environ.get("LINKEDIN_TEST_USERNAME"),
+                "password": os.environ.get("LINKEDIN_TEST_PASSWORD"),
+            },
+        },
    }

    @pytest.mark.download

@@ -82,10 +94,10 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
        """
        Test downloading pages with media.
        """

        self.extractor = setup_module(
            self.extractor_module,
-            {
+            self.config
+            | {
                "save_to_pdf": True,
                "max_download_images": 5,
                "max_download_videos": "inf",

@@ -118,6 +130,50 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
            f"Expected media with id '{expected_id}' not found"
        )

+    @pytest.mark.skipif(
+        not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"),
+        reason="No Reddit test credentials provided",
+    )
+    @pytest.mark.download
+    @pytest.mark.parametrize(
+        "url,in_title,in_text,image_count,video_count",
+        [
+            (
+                "https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/",
+                "Duy tran is the owner and prime wood work artist",
+                " Created Jan 26, 2015",
+                4,
+                0,
+            ),
+        ],
+    )
+    def test_reddit_download_with_login(
+        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
+    ):
+        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
+
+    @pytest.mark.skipif(
+        not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"),
+        reason="No LinkedIn test credentials provided",
+    )
+    @pytest.mark.download
+    @pytest.mark.parametrize(
+        "url,in_title,in_text,image_count,video_count",
+        [
+            (
+                "https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/",
+                "Post",
+                "It takes time to go from hunch to reporting...",
+                2,
+                0,
+            ),
+        ],
+    )
+    def test_linkedin_download_with_login(
+        self, setup_module, make_item, url, in_title, in_text, image_count, video_count
+    ):
+        self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
+
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url,in_html",
@@ -1,5 +1,6 @@
import pytest

from auto_archiver.utils.url import (
+    clean,
    is_auth_wall,
    check_url_or_raise,
    domain_for_url,

@@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality):
)
def test_get_media_url_best_quality(input_url, expected_url):
    assert get_media_url_best_quality(input_url) == expected_url


+@pytest.mark.parametrize(
+    "input_url,expected_url",
+    [
+        # No trackers present
+        ("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"),
+        # Single tracker present
+        ("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"),
+        # Multiple trackers present
+        ("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"),
+        # Trackers mixed with other params
+        (
+            "https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123",
+            "https://example.com/page?foo=bar&baz=qux",
+        ),
+        # Only trackers present
+        ("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"),
+        # No query string
+        ("https://example.com/page", "https://example.com/page"),
+        # Tracker in fragment (should not be removed)
+        ("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"),
+        # Tracker in query with a fragment present
+        ("https://example.com/page?utm_source=google#section-1", "https://example.com/page#section-1"),
+        # Tracker with empty value
+        ("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"),
+        # Tracker with multiple values
+        ("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"),
+        # Tracker with encoded value
+        ("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"),
+        # Unrelated param with a similar name
+        ("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"),
+    ],
+)
+def test_clean_removes_trackers(input_url, expected_url):
+    assert clean(input_url) == expected_url