dropping screenshot enricher

2025-06-04 12:08:59 +01:00 · 2025-06-04 12:08:59 +01:00 · e6f3826a3a
commit e6f3826a3a
--- a/plugin.zip
+++ b/plugin.zip
--- a/poetry.lock
+++ b/poetry.lock
@@ -4154,4 +4154,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "f142f1b7907266898024fbab926401f52bc4b281aef5f52e96382ce21afca1d1"
+content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ dependencies = [
    "bs4 (>=0.0.0)",
    "loguru (>=0.0.0)",
    "ffmpeg-python (>=0.0.0)",
-    "selenium (>=0.0.0)",
    "telethon (>=0.0.0)",
    "google-api-python-client (>=0.0.0)",
    "google-auth-httplib2 (>=0.0.0)",
--- a/src/auto_archiver/modules/screenshot_enricher/__init__.py
+++ b/src/auto_archiver/modules/screenshot_enricher/__init__.py
@@ -1 +0,0 @@
-from .screenshot_enricher import ScreenshotEnricher
--- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
@@ -1,44 +0,0 @@
-{
-    "name": "Screenshot Enricher",
-    "type": ["enricher"],
-    "requires_setup": True,
-    "dependencies": {
-        "python": ["loguru", "selenium"],
-    },
-    "configs": {
-        "width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
-        "height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
-        "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
-        "sleep_before_screenshot": {
-            "default": 4,
-            "type": "int",
-            "help": "seconds to wait for the pages to load before taking screenshot",
-        },
-        "http_proxy": {
-            "default": "",
-            "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
-        },
-        "save_to_pdf": {
-            "default": False,
-            "type": "bool",
-            "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
-        },
-        "print_options": {
-            "default": {},
-            "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
-            "type": "json_loader",
-        },
-    },
-    "description": """
-    Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
-
-    ### Features
-    - Takes screenshots of web pages, with configurable width, height, and timeout settings.
-    - Optionally saves pages as PDFs, with additional configuration for PDF printing options.
-    - Bypasses URLs detected as authentication walls.
-    - Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
-
-    ### Notes
-    - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
-    """,
-}
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -1,61 +0,0 @@
-from loguru import logger
-import time
-import os
-import base64
-
-from selenium.common.exceptions import TimeoutException
-
-
-from auto_archiver.core import Enricher
-from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
-from auto_archiver.core import Media, Metadata
-
-
-class ScreenshotEnricher(Enricher):
-    def __init__(self, webdriver_factory=None):
-        super().__init__()
-        self.webdriver_factory = webdriver_factory or Webdriver
-
-    def enrich(self, to_enrich: Metadata) -> None:
-        url = to_enrich.get_url()
-
-        logger.debug(f"Enriching screenshot for {url=}")
-        auth = self.auth_for_site(url)
-
-        # screenshot enricher only supports cookie-type auth (selenium)
-        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
-
-        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
-            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
-            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
-                logger.warning(
-                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
-                               Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
-                )
-            return
-
-        with self.webdriver_factory(
-            self.width,
-            self.height,
-            self.timeout,
-            facebook_accept_cookies="facebook.com" in url,
-            http_proxy=self.http_proxy,
-            print_options=self.print_options,
-            auth=auth,
-        ) as driver:
-            try:
-                driver.get(url)
-                time.sleep(int(self.sleep_before_screenshot))
-                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
-                driver.save_screenshot(screenshot_file)
-                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
-                if self.save_to_pdf:
-                    pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
-                    pdf = driver.print_page(driver.print_options)
-                    with open(pdf_file, "wb") as f:
-                        f.write(base64.b64decode(pdf))
-                    to_enrich.add_media(Media(filename=pdf_file), id="pdf")
-            except TimeoutException:
-                logger.info("TimeoutException loading page for screenshot")
-            except Exception as e:
-                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
				`@ -1 +0,0 @@`
				`from .screenshot_enricher import ScreenshotEnricher`