Merge pull request #159 from bellingcat/print_pdf

Add 'print_pdf' option to the screenshot enricher. Fixes #132
pull/165/head
Patrick Robertson 2025-01-06 18:13:38 +01:00 zatwierdzone przez GitHub
commit bffa3a6254
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
5 zmienionych plików z 43 dodań i 5 usunięć

Wyświetl plik

@ -105,6 +105,22 @@ configurations:
screenshot_enricher: screenshot_enricher:
width: 1280 width: 1280
height: 2300 height: 2300
# to save as pdf, uncomment the following lines and adjust the print options
# save_to_pdf: true
# print_options:
# for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
# background: true
# orientation: "portrait"
# scale: 1
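# sizes and margins are plain numbers in centimeters; unit suffixes such as "in" are not accepted
# page_width: 21.59
# page_height: 27.94
# margin_top: 1.0
# margin_bottom: 1.0
# margin_left: 1.0
# margin_right: 1.0
# page_ranges must be a list, e.g. ["1-2"]; leave it empty to print every page
# page_ranges: []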
# shrink_to_fit: true
wayback_archiver_enricher: wayback_archiver_enricher:
timeout: 10 timeout: 10
key: "wayback key" key: "wayback key"

Wyświetl plik

@ -1,5 +1,7 @@
from loguru import logger from loguru import logger
import time, os import time, os
import base64
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher):
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url): if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return return
logger.debug(f"Enriching screenshot for {url=}") logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
try: try:
driver.get(url) driver.get(url)
time.sleep(int(self.sleep_before_screenshot)) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))
to_enrich.add_media(Media(filename=pdf_file), id="pdf")
except TimeoutException: except TimeoutException:
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
except Exception as e: except Exception as e:

Wyświetl plik

@ -286,11 +286,11 @@
// logic for enabled/disabled greyscale // logic for enabled/disabled greyscale
// Get references to the checkboxes and images/videos // Get references to the checkboxes and images/videos
const safeImageViewCheckbox = document.getElementById('safe-media-view'); const safeImageViewCheckbox = document.getElementById('safe-media-view');
const imagesVideos = document.querySelectorAll('img, video'); const visualPreviews = document.querySelectorAll('img, video,embed');
// Function to toggle grayscale effect // Function to toggle grayscale effect
function toggleGrayscale() { function toggleGrayscale() {
imagesVideos.forEach(element => { visualPreviews.forEach(element => {
if (safeImageViewCheckbox.checked) { if (safeImageViewCheckbox.checked) {
// Enable grayscale effect // Enable grayscale effect
element.style.filter = 'grayscale(1)'; element.style.filter = 'grayscale(1)';
@ -307,7 +307,7 @@
safeImageViewCheckbox.addEventListener('change', toggleGrayscale); safeImageViewCheckbox.addEventListener('change', toggleGrayscale);
// Handle the hover effect using JavaScript // Handle the hover effect using JavaScript
imagesVideos.forEach(element => { visualPreviews.forEach(element => {
element.addEventListener('mouseenter', () => { element.addEventListener('mouseenter', () => {
// Disable grayscale effect on hover // Disable grayscale effect on hover
element.style.filter = 'none'; element.style.filter = 'none';

Wyświetl plik

@ -32,6 +32,10 @@ No URL available for {{ m.key }}.
Your browser does not support the video element. Your browser does not support the video element.
</video> </video>
</div> </div>
{% elif 'application/pdf' in m.mimetype %}
<div>
<embed src="{{ url }}" width="100%" height="400px"/>
</div>
{% elif 'audio' in m.mimetype %} {% elif 'audio' in m.mimetype %}
<div> <div>
<audio controls> <audio controls>

Wyświetl plik

@ -2,18 +2,24 @@ from __future__ import annotations
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.proxy import Proxy, ProxyType from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.print_page_options import PrintOptions
from loguru import logger from loguru import logger
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
import time import time
class Webdriver: class Webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict | None = None) -> None:
    """Store the settings for the browser session and build its print options.

    Args:
        width: value stored as ``self.width``.
        height: value stored as ``self.height``.
        timeout_seconds: value stored as ``self.timeout_seconds``.
        facebook_accept_cookies: flag stored as ``self.facebook_accept_cookies``.
        http_proxy: proxy string stored as ``self.http_proxy``; empty by default.
        print_options: optional mapping of ``PrintOptions`` attribute names to
            values. ``None`` (the default) or an empty mapping leaves the
            ``PrintOptions`` object at its defaults.

    Raises:
        Whatever a ``PrintOptions`` attribute setter raises for a value it
        rejects; nothing is caught here.
    """
    self.width = width
    self.height = height
    self.timeout_seconds = timeout_seconds
    self.facebook_accept_cookies = facebook_accept_cookies
    self.http_proxy = http_proxy
    # create and set print options
    self.print_options = PrintOptions()
    # The default is None rather than a shared ``{}`` literal, so no dict
    # object is reused across instances; None is treated as "no overrides".
    for k, v in (print_options or {}).items():
        setattr(self.print_options, k, v)
def __enter__(self) -> webdriver: def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
@ -24,6 +30,7 @@ class Webdriver:
self.driver = webdriver.Firefox(options=options) self.driver = webdriver.Firefox(options=options)
self.driver.set_window_size(self.width, self.height) self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds) self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e: except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")