diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index ea19c92..66ecd74 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -121,7 +121,7 @@ class TelethonArchiver(Archiverv2): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = item.get("tmp_dir") + tmp_dir = item.get_tmp_dir() group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index ba3785a..26aae68 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -68,13 +68,15 @@ class GsheetsDb(Database): batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('timestamp', item.get_timestamp()) + if (screenshot := item.get_media_by_id("screenshot")): + batch_if_valid('screenshot', screenshot.cdn_url) + # batch_if_valid('status', item.status) # TODO: AFTER ENRICHMENTS # batch_if_valid('hash', media.hash) # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') # batch_if_valid('thumbnail_index', result.thumbnail_index) # batch_if_valid('duration', result.duration, str(result.duration)) - # batch_if_valid('screenshot', result.screenshot) # if result.wacz is not None: # batch_if_valid('wacz', result.wacz) # batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') @@ -91,5 +93,5 @@ class GsheetsDb(Database): def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: gw: GWorksheet = item.get("gsheet").get("worksheet") row: int = item.get("gsheet").get("row") - #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now + # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now return gw, row diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 503ea2c..2a871d1 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,2 +1,3 @@ from .enricher import Enricher -from .screenshot_enricher import ScreenshotEnricher \ No newline at end of file +from .screenshot_enricher import ScreenshotEnricher +from .wayback_enricher import WaybackEnricher \ No newline at end of file diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py index faf43d8..9d11276 100644 --- a/src/enrichers/enricher.py +++ b/src/enrichers/enricher.py @@ -18,4 +18,4 @@ class Enricher(Step, ABC): return Step.init(name, config, Enricher) @abstractmethod - def enrich(self, item: Metadata) -> Metadata: pass + def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py index 5018859..b008e52 100644 --- a/src/enrichers/screenshot_enricher.py +++ b/src/enrichers/screenshot_enricher.py @@ -1,13 +1,14 @@ +from media import Media from utils import Webdriver from . import Enricher from metadata import Metadata from loguru import logger +import time, uuid, os from selenium.common.exceptions import TimeoutException -import time class ScreenshotEnricher(Enricher): - name = "screenshot" + name = "screenshot_enricher" @staticmethod def configs() -> dict: @@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher): "timeout": {"default": 60, "help": "timeout for taking the screenshot"} } - def enrich(self, item: Metadata) -> Metadata: - url = self.get_url(item) - print(f"enriching {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"Enriching screenshot for {url=}") + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: try: driver.get(url) time.sleep(2) + screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") + driver.save_screenshot(screenshot_file) + to_enrich.add_media(Media(filename=screenshot_file, id="screenshot")) except TimeoutException: logger.info("TimeoutException loading page for screenshot") - - #TODO: return saved object - driver.save_screenshot("TODO-HASH_OR_UUID.png") - return None + except Exception as e: + logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") + # return None diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py new file mode 100644 index 0000000..09a43e0 --- /dev/null +++ b/src/enrichers/wayback_enricher.py @@ -0,0 +1,68 @@ +from utils import Webdriver +from . import Enricher +from metadata import Metadata +from loguru import logger +from selenium.common.exceptions import TimeoutException +import time, requests + + +class WaybackEnricher(Enricher): + """ + Submits the current URL to the webarchive and returns a job_id or completed archive + """ + name = "wayback_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" + assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" + + @staticmethod + def configs() -> dict: + return { + "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."}, + "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, + "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} + } + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"Enriching wayback for {url=}") + + ia_headers = { + "Accept": "application/json", + "Authorization": f"LOW {self.key}:{self.secret}" + } + r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + + if r.status_code != 200: + logger.error(em:=f"Internet archive failed with status of {r.status_code}: {r.json()}") + to_enrich.set("wayback", em) + return + + # check job status + job_id = r.json()['job_id'] + + # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information + start_time = time.time() + wayback_url = False + attempt = 1 + while not wayback_url and time.time() - start_time <= self.timeout: + try: + + logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") + r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) + r_json = r_status.json() + if r_status.status_code == 200 and r_json['status'] == 'success': + wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}" + except Exception as e: + logger.warning(f"error fetching status for {url=} due to: {e}") + if not wayback_url: + attempt += 1 + time.sleep(1) # TODO: can be improved with exponential backoff + + if wayback_url: + to_enrich.set("wayback", wayback_url) + else: + to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'}) diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py index 6c278f5..7443568 100644 --- a/src/formatters/html_formatter.py +++ b/src/formatters/html_formatter.py @@ -30,7 +30,7 @@ class HtmlFormatter(Formatter): media=item.media, metadata=item.get_clean_metadata() ) - html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html") + html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) return Media(filename=html_path) diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index fa278eb..fc986f0 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -60,6 +60,9 @@ {% endif %}
made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page