diff --git a/example.orchestration.yaml b/example.orchestration.yaml index ef36efd..a58bc29 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -105,6 +105,22 @@ configurations: screenshot_enricher: width: 1280 height: 2300 + # to save as pdf, uncomment the following lines and adjust the print options + # save_to_pdf: true + # print_options: + # for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html + # background: true + # orientation: "portrait" + # scale: 1 + # page_width: 8.5in + # page_height: 11in + # margin_top: 0.4in + # margin_bottom: 0.4in + # margin_left: 0.4in + # margin_right: 0.4in + # page_ranges: "" + # shrink_to_fit: true + wayback_archiver_enricher: timeout: 10 key: "wayback key" diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 69f466b..b2ef096 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -1,5 +1,7 @@ from loguru import logger import time, os +import base64 + from selenium.common.exceptions import TimeoutException @@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher): "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + "print_options": {"default": {}, "help": "options to pass to the pdf printer"} } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() + if UrlUtil.is_auth_wall(url): logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") return logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver: try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") + if self.save_to_pdf: + pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf = driver.print_page(driver.print_options) + with open(pdf_file, "wb") as f: + f.write(base64.b64decode(pdf)) + to_enrich.add_media(Media(filename=pdf_file), id="pdf") except TimeoutException: logger.info("TimeoutException loading page for screenshot") except Exception as e: diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html index 13ec15a..8bdf5ef 100644 --- a/src/auto_archiver/formatters/templates/html_template.html +++ b/src/auto_archiver/formatters/templates/html_template.html @@ -286,11 +286,11 @@ // logic for enabled/disabled greyscale // Get references to the checkboxes and images/videos const safeImageViewCheckbox = document.getElementById('safe-media-view'); - const imagesVideos = document.querySelectorAll('img, video'); + const visualPreviews = document.querySelectorAll('img, video,embed'); // Function to toggle grayscale effect function toggleGrayscale() { - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { if (safeImageViewCheckbox.checked) { // Enable grayscale effect element.style.filter = 'grayscale(1)'; @@ -307,7 +307,7 @@ safeImageViewCheckbox.addEventListener('change', toggleGrayscale); // Handle the hover effect using JavaScript - imagesVideos.forEach(element => { + visualPreviews.forEach(element => { element.addEventListener('mouseenter', () => { // Disable grayscale effect on hover element.style.filter = 'none'; diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html index c2281c2..1373b43 100644 --- a/src/auto_archiver/formatters/templates/macros.html +++ b/src/auto_archiver/formatters/templates/macros.html @@ -32,6 +32,10 @@ No URL available for {{ m.key }}. Your browser does not support the video element. +{% elif 'application/pdf' in m.mimetype %} +