diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index 49efd66..c986536 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -23,10 +23,15 @@ class Archiver(Step):
         # used when archivers need to login or do other one-time setup
         pass
 
-    def clean_url(self, url: str) -> str:
-        # used to clean unnecessary URL parameters
+    def sanitize_url(self, url: str) -> str:
+        # used to clean unnecessary URL parameters OR unfurl redirect links
         return url
 
+    def is_rearchivable(self, url: str) -> bool:
+        # archivers can signal if it does not make sense to rearchive a piece of content
+        # default is rearchiving
+        return True
+
     def _guess_file_type(self, path: str) -> str:
         """
         Receives a URL or filename and returns global mimetype like 'image' or 'video'
@@ -57,19 +62,3 @@ class Archiver(Step):
     @abstractmethod
     def download(self, item: Metadata) -> Metadata:
         pass
-
-    # TODO: how to fix allow predictable key
-    # def get_key(self, filename):
-    #     """
-    #     returns a key in the format "[archiverName]_[filename]" includes extension
-    #     """
-    #     tail = os.path.split(filename)[1]  # returns filename.ext from full path
-    #     _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
-    #     if 'unknown_video' in _id:
-    #         _id = _id.replace('unknown_video', 'jpg')
-
-    #     # long filenames can cause problems, so trim them if necessary
-    #     if len(_id) > 128:
-    #         _id = _id[-128:]
-
-    #     return f'{self.name}_{_id}{extension}'
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/archivers/telegram_archiver.py
index b579dc4..6df421b 100644
--- a/src/auto_archiver/archivers/telegram_archiver.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@@ -22,6 +22,10 @@ class TelegramArchiver(Archiver):
     def configs() -> dict:
         return {}
 
+    def is_rearchivable(self, url: str) -> bool:
+        # telegram posts are static
+        return False
+
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
         # detect URLs that we definitely cannot handle
diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/archivers/telethon_archiver.py
index 20bf28e..2fd556b 100644
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -39,6 +39,10 @@ class TelethonArchiver(Archiver):
         }
     }
 
+    def is_rearchivable(self, url: str) -> bool:
+        # telegram posts are static
+        return False
+
     def setup(self) -> None:
         """
         1. trigger login process for telegram or proceed if already saved in a session file
diff --git a/src/auto_archiver/archivers/tiktok_archiver.py b/src/auto_archiver/archivers/tiktok_archiver.py
index 0c41193..532df25 100644
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -19,6 +19,10 @@ class TiktokArchiver(Archiver):
     @staticmethod
     def configs() -> dict:
         return {}
+
+    def is_rearchivable(self, url: str) -> bool:
+        # TikTok posts are static
+        return False
 
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
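The two new base-class hooks read most clearly in a toy subclass. Below is a minimal sketch (not part of this diff) of an archiver that overrides both `sanitize_url` and `is_rearchivable`; the `ExampleArchiver` name, its import path, and its query-stripping rule are hypothetical:

```python
from auto_archiver.archivers import Archiver  # assumed package export
from auto_archiver.core import Metadata


class ExampleArchiver(Archiver):
    # hypothetical archiver for a site whose posts never change after publication
    name = "example_archiver"

    @staticmethod
    def configs() -> dict:
        return {}

    def sanitize_url(self, url: str) -> str:
        # drop the query string; archivers that don't care keep the inherited no-op
        return url.split("?")[0]

    def is_rearchivable(self, url: str) -> bool:
        # posts on this site are static, so rearchiving adds nothing
        return False

    def download(self, item: Metadata) -> Metadata:
        return item  # a real archiver would fetch media and merge it into item
```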
diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py
index b5b6dda..a811abf 100644
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -11,6 +11,7 @@ from . import Archiver
 from ..core import Metadata
 from ..core import Media
 
+
 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods.
@@ -18,6 +19,7 @@ class TwitterArchiver(Archiver):
 
     name = "twitter_archiver"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
+    link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
@@ -26,6 +28,22 @@ class TwitterArchiver(Archiver):
     def configs() -> dict:
         return {}
 
+    def sanitize_url(self, url: str) -> str:
+        # expand URL if t.co and clean tracker GET params
+        if 'https://t.co/' in url:
+            try:
+                r = requests.get(url)
+                logger.debug(f'Expanded url {url} to {r.url}')
+                url = r.url
+            except Exception as e:
+                logger.error(f'Failed to expand url {url}: {e}')
+        # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
+        return self.link_clean_pattern.sub("\\1", url)
+
+    def is_rearchivable(self, url: str) -> bool:
+        # Twitter posts are static
+        return False
+
     def download(self, item: Metadata) -> Metadata:
         """
         if this url is archivable will download post info and look for other posts from the same group with media.
diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/archivers/vk_archiver.py
index 1d76282..b1febe0 100644
--- a/src/auto_archiver/archivers/vk_archiver.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -28,6 +28,10 @@ class VkArchiver(Archiver):
         "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
     }
 
+    def is_rearchivable(self, url: str) -> bool:
+        # VK content is static
+        return False
+
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index dfbd2ec..09587e5 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -22,7 +22,7 @@ class Metadata:
     tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None  # can be overwritten by formatters
-    rearchivable: bool = False
+    rearchivable: bool = True  # defaults to true, archivers can overwrite
 
     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
@@ -140,14 +140,5 @@ class Metadata:
     def get_clean_metadata(self) -> Metadata:
         return dict(
             {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
-            **{"processed_at": self._processed_at}  # TODO: move to enrichment
-        )
-
-    def cleanup(self) -> Metadata:
-        # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
-        # the code below leads to errors if database needs tmp_keys after they are removed
-        # """removes temporary metadata fields, ideally called after all ops except writing"""
-        # for tmp_key in self.tmp_keys:
-        #     self.metadata.pop(tmp_key, None)
-        # self.tmp_keys = set()
-        pass
+            **{"processed_at": self._processed_at}
+        )
\ No newline at end of file
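The `link_clean_pattern` added to `twitter_archiver.py` above can be sanity-checked in isolation; this standalone snippet (not part of the diff) runs it against the tracker-laden URL from the inline comment:

```python
import re

link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")

url = "https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
print(link_clean_pattern.sub("\\1", url))
# -> https://twitter.com/MeCookieMonster/status/1617921633456640001
```

Group 1 greedily captures everything up to the last `/<digits>` run (the status id), so the substitution discards the `?s=...&t=...` trackers while leaving URLs that have no query string untouched.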
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index eaef270..25c2854 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -91,19 +91,19 @@ class ArchivingOrchestrator:
     # default feeder is a list with 1 element
 
     def archive(self, result: Metadata) -> Union[Metadata, None]:
-        url = result.get_url()
-        # TODO: clean urls
-        for a in self.archivers:
-            url = a.clean_url(url)
-        result.set_url(url)
-        # should_archive = False
-        # for d in self.databases: should_archive |= d.should_process(url)
-        # should storages also be able to check?
-        # for s in self.storages: should_archive |= s.should_process(url)
+        original_url = result.get_url()
 
-        # if not should_archive:
-        #     print("skipping")
-        #     return "skipping"
+        # 1 - cleanup
+        # each archiver is responsible for cleaning/expanding its own URLs
+        url = original_url
+        for a in self.archivers: url = a.sanitize_url(url)
+        result.set_url(url)
+        if original_url != url: result.set("original_url", original_url)
+
+        # 2 - rearchiving logic + notify start to DB
+        # archivers can signal whether the content is rearchivable: eg: tweet vs webpage
+        for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
+        logger.debug(f"{result.rearchivable=} for {url=}")
 
         # signal to DB that archiving has started
         # and propagate already archived if it exists
@@ -117,33 +117,33 @@ class ArchivingOrchestrator:
             if (local_result := d.fetch(result)):
                 cached_result = (cached_result or Metadata()).merge(local_result)
         if cached_result and not cached_result.rearchivable:
+            logger.debug("Found previously archived entry")
             for d in self.databases:
                 d.done(cached_result)
             return cached_result
 
-        # vk, telethon, ...
+        # 3 - call archivers until one succeeds
         for a in self.archivers:
-            # with automatic try/catch in download + archived (+ the other ops below)
-            # should the archivers come with the config already? are there configs which change at runtime?
-            # think not, so no need to pass config as parameter
-            # do they need to be refreshed with every execution?
-            # this is where the Hashes come from, the place with access to all content
-            # the archiver does not have access to storage
-            # a.download(result)
-            # TODO: refactor so there's not merge here
             logger.info(f"Trying archiver {a.name}")
-            result.merge(a.download(result))
-            if result.is_success(): break
+            try:
+                # Q: should this be refactored so it's just a.download(result)?
+                result.merge(a.download(result))
+                if result.is_success(): break
+            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
 
         # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
         # should it call the HTMLgenerator as if it's not an enrichment?
        # eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
         # then how to execute it last? should there also be post-processors? are there other examples?
         # maybe as a PDF? or a Markdown file
-        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
+
+        # 4 - call enrichers: have access to archived content, can generate metadata and Media
+        # eg: screenshot, wacz, webarchive, thumbnails
         for e in self.enrichers:
             e.enrich(result)
 
-        # store media
+        # 5 - store media
+        # looks for Media in result.media and also result.media[x].properties (as list or dict values)
         for s in self.storages:
             for m in result.media:
                 s.store(m, result)  # modifies media
@@ -155,19 +155,14 @@ class ArchivingOrchestrator:
                         for prop_media in prop:
                             s.store(prop_media, result)
 
-        # formatters, enrichers, and storages will sometimes look for specific properties: eg Screenshot
-        # TODO: should there only be 1 formatter?
-        # for f in self.formatters:
-        #     result.merge(f.format(result))
-        # final format and store it
+        # 6 - format and store formatted if needed
+        # the formatter typically needs access to already stored URLs etc
         if (final_media := self.formatter.format(result)):
             for s in self.storages:
                 s.store(final_media, result)
             result.set_final_media(final_media)
 
         # signal completion to databases (DBs, Google Sheets, CSV, ...)
-        # a hash registration service could be one database: forensic archiving
-        result.cleanup()
         for d in self.databases:
             d.done(result)
         return result
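For reviewers, step 5's nested lookup is the least obvious part of the new numbered flow: it stores the top-level `Media` items and any `Media` hiding inside each item's `properties`, whether stored as a single value or inside a list. A rough standalone sketch of that traversal, with a dataclass standing in for `Media` and `print` standing in for `s.store` (all names here are illustrative, not the codebase's):

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class FakeMedia:
    # stand-in for auto_archiver.core.Media: a file plus arbitrary properties
    filename: str
    properties: Dict[str, Any] = field(default_factory=dict)


def store_all(media: List[FakeMedia]) -> None:
    for m in media:
        print(f"store {m.filename}")  # top-level media
        for prop in m.properties.values():
            if isinstance(prop, FakeMedia):  # a single nested Media value
                print(f"store {prop.filename}")
            elif isinstance(prop, list):  # a list of nested Media values
                for prop_media in prop:
                    if isinstance(prop_media, FakeMedia):
                        print(f"store {prop_media.filename}")


video = FakeMedia("video.mp4", properties={
    "thumbnails": [FakeMedia("thumb0.jpg"), FakeMedia("thumb1.jpg")],
})
store_all([video])
# -> store video.mp4, store thumb0.jpg, store thumb1.jpg
```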