dropping screenshot enricher

pull/311/head
msramalho 2025-06-04 12:08:59 +01:00
rodzic e5a78a5d06
commit e6f3826a3a
Nie znaleziono w bazie danych klucza dla tego podpisu
6 zmienionych plików z 1 dodań i 108 usunięć

Plik binarny nie jest wyświetlany.

2
poetry.lock wygenerowano
Wyświetl plik

@ -4154,4 +4154,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "f142f1b7907266898024fbab926401f52bc4b281aef5f52e96382ce21afca1d1"
content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350"

Wyświetl plik

@ -27,7 +27,6 @@ dependencies = [
"bs4 (>=0.0.0)",
"loguru (>=0.0.0)",
"ffmpeg-python (>=0.0.0)",
"selenium (>=0.0.0)",
"telethon (>=0.0.0)",
"google-api-python-client (>=0.0.0)",
"google-auth-httplib2 (>=0.0.0)",

Wyświetl plik

@ -1 +0,0 @@
from .screenshot_enricher import ScreenshotEnricher

Wyświetl plik

@ -1,44 +0,0 @@
{
"name": "Screenshot Enricher",
"type": ["enricher"],
"requires_setup": True,
"dependencies": {
"python": ["loguru", "selenium"],
},
"configs": {
"width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
"height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
"timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {
"default": 4,
"type": "int",
"help": "seconds to wait for the pages to load before taking screenshot",
},
"http_proxy": {
"default": "",
"help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
},
"save_to_pdf": {
"default": False,
"type": "bool",
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
},
"print_options": {
"default": {},
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
"type": "json_loader",
},
},
"description": """
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
### Features
- Takes screenshots of web pages, with configurable width, height, and timeout settings.
- Optionally saves pages as PDFs, with additional configuration for PDF printing options.
- Bypasses URLs detected as authentication walls.
- Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
### Notes
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
""",
}

Wyświetl plik

@ -1,61 +0,0 @@
from loguru import logger
import time
import os
import base64
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
super().__init__()
self.webdriver_factory = webdriver_factory or Webdriver
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
# screenshot enricher only supports cookie-type auth (selenium)
has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
if UrlUtil.is_auth_wall(url) and not has_valid_auth:
logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
logger.warning(
f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
)
return
with self.webdriver_factory(
self.width,
self.height,
self.timeout,
facebook_accept_cookies="facebook.com" in url,
http_proxy=self.http_proxy,
print_options=self.print_options,
auth=auth,
) as driver:
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))
to_enrich.add_media(Media(filename=pdf_file), id="pdf")
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")