diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index ea19c92..66ecd74 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -121,7 +121,7 @@ class TelethonArchiver(Archiverv2):
         media_posts = self._get_media_posts_in_group(chat, post)
         logger.debug(f'got {len(media_posts)=} for {url=}')
 
-        tmp_dir = item.get("tmp_dir")
+        tmp_dir = item.get_tmp_dir()
 
        group_id = post.grouped_id if post.grouped_id is not None else post.id
        title = post.message
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index ba3785a..26aae68 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -68,13 +68,15 @@ class GsheetsDb(Database):
         batch_if_valid('title', item.get_title())
         batch_if_valid('text', item.get("content", "")[:500])
         batch_if_valid('timestamp', item.get_timestamp())
+        if (screenshot := item.get_media_by_id("screenshot")):
+            batch_if_valid('screenshot', screenshot.cdn_url)
+
         # batch_if_valid('status', item.status)
         # TODO: AFTER ENRICHMENTS
         # batch_if_valid('hash', media.hash)
         # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
         # batch_if_valid('thumbnail_index', result.thumbnail_index)
         # batch_if_valid('duration', result.duration, str(result.duration))
-        # batch_if_valid('screenshot', result.screenshot)
         # if result.wacz is not None:
         #     batch_if_valid('wacz', result.wacz)
         #     batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
@@ -91,5 +93,5 @@ class GsheetsDb(Database):
     def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
         gw: GWorksheet = item.get("gsheet").get("worksheet")
         row: int = item.get("gsheet").get("row")
-        #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could first try to fetch "gsheet" from the item and, if missing, manage its own singleton - not needed for now
         return gw, row
diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py
index 503ea2c..2a871d1 100644
--- a/src/enrichers/__init__.py
+++ b/src/enrichers/__init__.py
@@ -1,2 +1,3 @@
 from .enricher import Enricher
-from .screenshot_enricher import ScreenshotEnricher
\ No newline at end of file
+from .screenshot_enricher import ScreenshotEnricher
+from .wayback_enricher import WaybackEnricher
\ No newline at end of file
diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py
index faf43d8..9d11276 100644
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -18,4 +18,4 @@ class Enricher(Step, ABC):
         return Step.init(name, config, Enricher)
 
     @abstractmethod
-    def enrich(self, item: Metadata) -> Metadata: pass
+    def enrich(self, to_enrich: Metadata) -> None: pass
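
With this change the Enricher contract becomes: mutate the Metadata you are given and return nothing, rather than returning a new Metadata for the orchestrator to merge. A minimal sketch of an enricher written against the new contract; the class, its name, and the hashing logic are hypothetical, purely to illustrate the interface:

import hashlib

from . import Enricher
from metadata import Metadata


class ExampleHashEnricher(Enricher):
    # hypothetical enricher: records a SHA-256 digest for every media file on the item
    name = "example_hash_enricher"

    @staticmethod
    def configs() -> dict:
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        # new contract: mutate to_enrich in place and return None
        for media in to_enrich.media:
            with open(media.filename, "rb") as f:
                to_enrich.set(f"hash_{media.filename}", hashlib.sha256(f.read()).hexdigest())
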
diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py
index 5018859..b008e52 100644
--- a/src/enrichers/screenshot_enricher.py
+++ b/src/enrichers/screenshot_enricher.py
@@ -1,13 +1,14 @@
+from media import Media
 from utils import Webdriver
 from . import Enricher
 from metadata import Metadata
 from loguru import logger
+import time, uuid, os
 from selenium.common.exceptions import TimeoutException
-import time
 
 
 class ScreenshotEnricher(Enricher):
-    name = "screenshot"
+    name = "screenshot_enricher"
 
     @staticmethod
     def configs() -> dict:
@@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher):
             "timeout": {"default": 60, "help": "timeout for taking the screenshot"}
         }
 
-    def enrich(self, item: Metadata) -> Metadata:
-        url = self.get_url(item)
-        print(f"enriching {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:  # TODO: make a util
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        logger.debug(f"Enriching screenshot for {url=}")
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
             try:
                 driver.get(url)
                 time.sleep(2)
+                screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
+                driver.save_screenshot(screenshot_file)
+                to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
             except TimeoutException:
                 logger.info("TimeoutException loading page for screenshot")
-
-        #TODO: return saved object
-        driver.save_screenshot("TODO-HASH_OR_UUID.png")
-        return None
+            except Exception as e:
+                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+            # return None
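
utils.Webdriver itself is not part of this diff; from the call site above it is a context manager constructed as Webdriver(width, height, timeout, 'facebook.com' in url) that yields a Selenium driver. A rough sketch of such a utility, assuming headless Firefox and a made-up name for the fourth parameter:

from selenium import webdriver


class Webdriver:
    # sketch only: the real utils.Webdriver is not shown in this diff
    def __init__(self, width: int, height: int, timeout: int, facebook_accept_cookies: bool = False) -> None:
        self.width, self.height, self.timeout = width, height, timeout
        self.facebook_accept_cookies = facebook_accept_cookies  # hypothetical parameter name

    def __enter__(self) -> webdriver.Firefox:
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)
        self.driver.set_window_size(self.width, self.height)
        self.driver.set_page_load_timeout(self.timeout)
        # the facebook flag would presumably drive a cookie-banner dismissal here
        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.driver.quit()
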
diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py
new file mode 100644
index 0000000..09a43e0
--- /dev/null
+++ b/src/enrichers/wayback_enricher.py
@@ -0,0 +1,68 @@
+from utils import Webdriver
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
+import time, requests
+
+
+class WaybackEnricher(Enricher):
+    """
+    Submits the current URL to the Wayback Machine and stores either the completed archive URL or a pending job_id on the item
+    """
+    name = "wayback_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # without this, Step.__init__ is not called
+        super().__init__(config)
+        assert type(self.key) == str and len(self.key) > 0, "please provide a value for the wayback_enricher API key"
+        assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "timeout": {"default": 5, "help": "number of seconds to wait for a response from the Wayback Machine; after that, only the job_id is saved but the page will still be processed."},
+            "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
+            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        logger.debug(f"Enriching wayback for {url=}")
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": f"LOW {self.key}:{self.secret}"
+        }
+        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            logger.error(em := f"Internet Archive failed with status of {r.status_code}: {r.json()}")
+            to_enrich.set("wayback", em)
+            return
+
+        # check job status
+        job_id = r.json()['job_id']
+
+        # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
+        start_time = time.time()
+        wayback_url = False
+        attempt = 1
+        while not wayback_url and time.time() - start_time <= self.timeout:
+            try:
+
+                logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
+                r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
+                r_json = r_status.json()
+                if r_status.status_code == 200 and r_json['status'] == 'success':
+                    wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
+            except Exception as e:
+                logger.warning(f"error fetching status for {url=} due to: {e}")
+            if not wayback_url:
+                attempt += 1
+                time.sleep(1)  # TODO: can be improved with exponential backoff
+
+        if wayback_url:
+            to_enrich.set("wayback", wayback_url)
+        else:
+            to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
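
On the "TODO: can be improved with exponential backoff" above, a minimal sketch of replacing the fixed one-second sleep in the polling loop; the doubling factor and the 16-second cap are illustrative choices, not from the source:

import time


def poll_with_backoff(check, timeout: float, base_sleep: float = 1.0, max_sleep: float = 16.0):
    # calls check() until it returns a truthy value or timeout elapses,
    # doubling the pause between attempts instead of sleeping a fixed second
    start, sleep_for = time.time(), base_sleep
    while time.time() - start <= timeout:
        if (result := check()):
            return result
        time.sleep(min(sleep_for, max_sleep))
        sleep_for *= 2
    return None

# usage in the loop above could then be, e.g.:
# wayback_url = poll_with_backoff(fetch_status_once, self.timeout)
# where fetch_status_once is a hypothetical wrapper around the GET to /save/status/{job_id}
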
diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py
index 6c278f5..7443568 100644
--- a/src/formatters/html_formatter.py
+++ b/src/formatters/html_formatter.py
@@ -30,7 +30,7 @@ class HtmlFormatter(Formatter):
             media=item.media,
             metadata=item.get_clean_metadata()
         )
-        html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html")
+        html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
         with open(html_path, mode="w", encoding="utf-8") as outf:
             outf.write(content)
         return Media(filename=html_path)
diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index fa278eb..fc986f0 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -60,6 +60,9 @@
             {% endif %}
             <li>key: {{ m.key }}</li>
             <li>type: {{ m.mimetype }}</li>
+            {% if m.id | length > 0 %}
+            <li>id: {{ m.id }}</li>
+            {% endif %}
@@ -91,11 +94,13 @@
 
         {% for key in metadata %}
         <tr>
             <td>{{ key }}</td>
-            <td>{{ metadata[key] }}</td>
+            <td>{{ metadata[key] | urlize }}</td>
         </tr>
         {% endfor %}
     </table>
+    <p>made with bellingcat/auto-archiver,
+        add suggestions and report issues on the project's github page</p>
 </body>
 </html>
\ No newline at end of file
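
The only behavioural change in the metadata table is piping each value through Jinja2's built-in urlize filter, which wraps bare URLs in anchor tags so they are clickable in the rendered report. A quick illustration:

from jinja2 import Environment

template = Environment().from_string("{{ text | urlize }}")
print(template.render(text="archived at https://example.com"))
# prints roughly: archived at <a href="https://example.com" rel="noopener">https://example.com</a>
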
diff --git a/src/media.py b/src/media.py
index 58eae27..3c416be 100644
--- a/src/media.py
+++ b/src/media.py
@@ -12,10 +12,13 @@ class Media:
     key: str = None
     cdn_url: str = None
     mimetype: str = None  # eg: image/jpeg
-    # id: str = None
+    id: str = None  # in case this type of media needs a special id, eg: screenshot
     # hash: str = None  # TODO: added by enrichers
 
     def set_mimetype(self) -> Media:
         if not self.mimetype:
             self.mimetype = mimetypes.guess_type(self.filename)[0]
         return self
+
+    def is_video(self) -> bool:
+        return (self.mimetype or "").startswith("video")  # mimetype may still be None if guessing failed
diff --git a/src/metadata.py b/src/metadata.py
index ceece8d..7af923c 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -28,9 +28,12 @@ class Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag
         """
+        if right is None: return self
         if overwrite_left:
-            self.status = right.status
+            if right.status and len(right.status):
+                self.status = right.status
             self.rearchivable |= right.rearchivable
+            self.tmp_keys |= right.tmp_keys
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -76,6 +79,12 @@ class Metadata:
     def get_title(self) -> str:
         return self.get("title")
 
+    def set_tmp_dir(self, tmp_dir: str) -> Metadata:
+        return self.set("tmp_dir", tmp_dir, True)
+
+    def get_tmp_dir(self) -> str:
+        return self.get("tmp_dir")
+
     def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
         assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
         return self.set("timestamp", timestamp)
@@ -88,9 +97,15 @@ class Metadata:
         return ts
 
     def add_media(self, media: Media) -> Metadata:
+        if media is None: return
         media.set_mimetype()
         return self.media.append(media)
 
+    def get_media_by_id(self, id: str) -> Media:
+        for m in self.media:
+            if m.id == id: return m
+        return None
+
     def set_final_media(self, final: Media) -> Metadata:
         if final:
             if self.final_media:
@@ -100,6 +115,7 @@ class Metadata:
         return self
 
     def get_single_media(self) -> Media:
+        # TODO: could be refactored to use a custom media.id
         if self.final_media: return self.final_media
         return self.media[0]
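
Media.id and Metadata.get_media_by_id are the two halves of the lookup that the gsheet_db.py change above relies on: the screenshot enricher tags its output with id="screenshot" and the database later retrieves it by that id. Usage as wired up in this diff (the filename here is illustrative):

from media import Media
from metadata import Metadata

item = Metadata()
item.add_media(Media(filename="screenshot_1a2b3c4d.png", id="screenshot"))

# later, e.g. in GsheetsDb, as in the gsheet_db.py hunk above
if (screenshot := item.get_media_by_id("screenshot")):
    print(screenshot.cdn_url)  # cdn_url would be filled in by a storage step
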
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 5a8ff31..3d554e0 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -52,74 +52,6 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """
 
-# @dataclass
-# class Metadata:
-#     # does not handle files, only primitives
-#     # the only piece of logic to handle files is the archiver, enricher, and storage
-#     status: str
-#     # title: str
-#     # url: str
-#     # hash: str
-#     main_file: Metadata
-#     metadata: Dict[str, Metadata]
-
-#     @staticmethod
-#     def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
-#         # should return a merged version of the Metadata
-#         # will work for archived() and enriched()
-#         # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
-#         pass
-
-#     def get(self, key) -> Union[Metadata, str]:
-#         # goes through metadata and returns the Metadata available
-#         pass
-
-#     def as_json(self) -> str:
-#         # converts all metadata and data into JSON
-#         pass
-
-
-"""
-@dataclass
-class ArchiveResult:
-    # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
-    status: str
-    url: str
-    metadata: Metadata
-    # title, url, hash, other={}
-    # cdn_url: str = None
-    # thumbnail: str = None
-    # thumbnail_index: str = None
-    # duration: float = None
-    # title: str = None
-    # timestamp: datetime.datetime = None
-    # screenshot: str = None
-    # wacz: str = None
-    # hash: str = None
-    # media: list = field(default_factory=list)
-
-    def __init__(self) -> None: pass
-
-    def update(self, metadata) -> None:
-        # receive a Metadata instance and update itself with it!
-        pass
-
-    def as_json(self) -> str:
-        # converts all metadata and data into JSON
-        pass
-"""
-
-"""
-There is a Superclass for:
-    * Database (should_process)
-
-How can GSheets work? it needs to feed from a READER (GSheets Feeder)
-
-Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
-The context metadata should include a temporary folder (maybe a LocalStorage instance?)
-"""
-
-
 class ArchivingOrchestrator:
     def __init__(self, config) -> None:
         # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@@ -128,7 +60,7 @@ class ArchivingOrchestrator:
         # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
         # where does that update/processing happen? in config.py
 
-        # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
+        # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
         # self.archivers = [
         #     Archiver.init(a, config)
         #     for a in config.archivers
@@ -166,7 +98,7 @@ class ArchivingOrchestrator:
         print("ARCHIVING", item)
         try:
             with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                item.set("tmp_dir", tmp_dir, True)
+                item.set_tmp_dir(tmp_dir)
                 result = self.archive(item)
                 print(result)
         except KeyboardInterrupt:
@@ -226,6 +158,7 @@ class ArchivingOrchestrator:
             # do they need to be refreshed with every execution?
            # this is where the Hashes come from, the place with access to all content
            # the archiver does not have access to storage
+            # a.download(result)  # TODO: refactor so there's no merge here
             result.merge(a.download(result))
             # TODO: fix logic
             if True or result.is_success(): break
@@ -237,7 +170,7 @@ class ArchivingOrchestrator:
         # maybe as a PDF? or a Markdown file
         # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
         for e in self.enrichers:
-            result.merge(e.enrich(result))
+            e.enrich(result)
 
         # store media
         unstored_media = result.media[::]
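
Net effect of the orchestrator changes: the temporary directory now travels on the item through the typed set_tmp_dir/get_tmp_dir accessors, and enrichers mutate the result in place instead of returning a Metadata to merge. A condensed, hypothetical helper showing the resulting per-item flow (not the orchestrator's actual code):

import tempfile

from metadata import Metadata


def orchestrate_one(orchestrator, item: Metadata) -> Metadata:
    with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
        item.set_tmp_dir(tmp_dir)  # typed accessor replaces item.set("tmp_dir", tmp_dir, True)
        # archive() runs archivers (which still merge their output) and
        # enrichers (which now mutate the result in place)
        result = orchestrator.archive(item)
    return result
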