auto-archiver/src/auto_archiver/enrichers/screenshot_enricher.py

41 wiersze
1.9 KiB
Python
Czysty Zwykły widok Historia

2022-11-15 15:00:52 +00:00
from loguru import logger
import time, os
2022-12-14 14:01:39 +00:00
from selenium.common.exceptions import TimeoutException
2022-11-15 15:00:52 +00:00
2023-01-21 19:01:02 +00:00
from . import Enricher
from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata, ArchivingContext
2022-11-15 15:00:52 +00:00
class ScreenshotEnricher(Enricher):
    """Enricher that attaches a browser screenshot of the archived URL.

    The page is opened through ``Webdriver``, given a configurable number of
    seconds to settle, and saved as a PNG inside the archiving context's
    temporary directory. When the file is written successfully it is added
    to the metadata as a ``Media`` with id ``"screenshot"``.
    """

    name = "screenshot_enricher"

    @staticmethod
    def configs() -> dict:
        """Return the options understood by this enricher.

        Each key maps to a dict holding the option's ``default`` value and a
        ``help`` text. ``enrich`` reads the options as attributes of the same
        name on the instance.
        """
        return {
            "width": {"default": 1280, "help": "width of the screenshots"},
            "height": {"default": 720, "help": "height of the screenshots"},
            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """Take a screenshot of ``to_enrich``'s URL and attach it as media.

        URLs reported as auth walls by ``UrlUtil.is_auth_wall`` are skipped.
        Errors raised while loading the page or saving the screenshot are
        logged and swallowed, so a failed screenshot never aborts the
        caller; in that case no media is added.

        Args:
            to_enrich: metadata object providing the URL and receiving the
                screenshot media.
        """
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return

        logger.debug(f"Enriching screenshot for {url=}")
        # The fourth positional argument is True only for facebook URLs;
        # presumably it enables facebook-specific handling in Webdriver -
        # TODO confirm against the Webdriver helper.
        # NOTE(review): the driver is created outside the try block, so an
        # error raised while starting it propagates to the caller.
        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
            try:
                driver.get(url)
                # Give dynamic content time to render before capturing.
                time.sleep(int(self.sleep_before_screenshot))
                # Random suffix keeps screenshots from colliding in the tmp dir.
                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
                # Selenium's save_screenshot reports an I/O failure by
                # returning False instead of raising, so only register the
                # file when it was actually written.
                if driver.save_screenshot(screenshot_file):
                    to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
                else:
                    logger.error(f"Could not save screenshot to {screenshot_file} for {url=}")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")