From 663c8ad93ad97308ea82e6aff28e0e30d67fe098 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 18 Dec 2024 13:37:44 +0300 Subject: [PATCH 1/2] Add 'save_to_pdf' option to the screenshot enricher. Fixes #132 --- example.orchestration.yaml | 16 ++++++++++++++++ .../enrichers/screenshot_enricher.py | 13 ++++++++++++- src/auto_archiver/utils/webdriver.py | 9 ++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/example.orchestration.yaml b/example.orchestration.yaml index a36b125..23a950b 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -97,6 +97,22 @@ configurations: screenshot_enricher: width: 1280 height: 2300 + # to save as pdf, uncomment the following lines and adjust the print options + # save_to_pdf: true + # print_options: + # page sizes and margins are plain numbers in centimeters; for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html + # background: true + # orientation: "portrait" + # scale: 1 + # page_width: 21.59 + # page_height: 27.94 + # margin_top: 1.0 + # margin_bottom: 1.0 + # margin_left: 1.0 + # margin_right: 1.0 + # page_ranges: [] + # shrink_to_fit: true + wayback_archiver_enricher: timeout: 10 key: "wayback key" diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 69f466b..b2ef096 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -1,5 +1,7 @@ from loguru import logger import time, os +import base64 + from selenium.common.exceptions import TimeoutException @@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher): "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + 
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + "print_options": {"default": {}, "help": "options to pass to the pdf printer"} } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() + if UrlUtil.is_auth_wall(url): logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") return logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver: try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") + if self.save_to_pdf: + pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf = driver.print_page(driver.print_options) + with open(pdf_file, "wb") as f: + f.write(base64.b64decode(pdf)) + to_enrich.add_media(Media(filename=pdf_file), id="pdf") except TimeoutException: logger.info("TimeoutException loading page for screenshot") except Exception as e: diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index dc21e17..7e95330 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -2,18 +2,24 @@ from __future__ import annotations from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.proxy import Proxy, ProxyType +from selenium.webdriver.common.print_page_options import PrintOptions + from loguru import logger from selenium.webdriver.common.by import By import time class Webdriver: - def 
__init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "") -> webdriver: + def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver: self.width = width self.height = height self.timeout_seconds = timeout_seconds self.facebook_accept_cookies = facebook_accept_cookies self.http_proxy = http_proxy + # create and set print options + self.print_options = PrintOptions() + for k, v in print_options.items(): + setattr(self.print_options, k, v) def __enter__(self) -> webdriver: options = webdriver.FirefoxOptions() @@ -24,6 +30,7 @@ class Webdriver: self.driver = webdriver.Firefox(options=options) self.driver.set_window_size(self.width, self.height) self.driver.set_page_load_timeout(self.timeout_seconds) + self.driver.print_options = self.print_options except TimeoutException as e: logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") From 83da9ae08990d8c800a166b5beb873b4dca6c409 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:19:26 +0000 Subject: [PATCH 2/2] adds pdf preview support for html formatter --- src/auto_archiver/formatters/templates/html_template.html | 6 +++--- src/auto_archiver/formatters/templates/macros.html | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html index 13ec15a..8bdf5ef 100644 --- a/src/auto_archiver/formatters/templates/html_template.html +++ b/src/auto_archiver/formatters/templates/html_template.html @@ -286,11 +286,11 @@ // logic for enabled/disabled greyscale // Get references to the checkboxes and images/videos const safeImageViewCheckbox = document.getElementById('safe-media-view'); - const imagesVideos = 
document.querySelectorAll('img, video'); + const visualPreviews = document.querySelectorAll('img, video,embed'); // Function to toggle grayscale effect function toggleGrayscale() { - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { if (safeImageViewCheckbox.checked) { // Enable grayscale effect element.style.filter = 'grayscale(1)'; @@ -307,7 +307,7 @@ safeImageViewCheckbox.addEventListener('change', toggleGrayscale); // Handle the hover effect using JavaScript - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { element.addEventListener('mouseenter', () => { // Disable grayscale effect on hover element.style.filter = 'none'; diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html index c2281c2..1373b43 100644 --- a/src/auto_archiver/formatters/templates/macros.html +++ b/src/auto_archiver/formatters/templates/macros.html @@ -32,6 +32,10 @@ No URL available for {{ m.key }}. Your browser does not support the video element. +{% elif 'application/pdf' in m.mimetype %} +
+ +
{% elif 'audio' in m.mimetype %}