diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 5c4bcfa..8ffcb03 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -105,7 +105,8 @@ class Metadata:
 
     def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
         ts = self.get("timestamp")
-        if not ts: return ts
+        if not ts: return
+        if type(ts) == float: ts = datetime.datetime.fromtimestamp(ts)
         if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
         if iso: return ts.isoformat()
         return ts
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 35ff73b..fb5e2ab 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -77,7 +77,7 @@ class ArchivingOrchestrator:
         if cached_result:
             logger.debug("Found previously archived entry")
             for d in self.databases:
-                d.done(cached_result)
+                d.done(cached_result, cached=True)
             return cached_result
 
         # 3 - call archivers until one succeeds
diff --git a/src/auto_archiver/databases/api_db.py b/src/auto_archiver/databases/api_db.py
index a1f256d..4800bef 100644
--- a/src/auto_archiver/databases/api_db.py
+++ b/src/auto_archiver/databases/api_db.py
@@ -1,3 +1,4 @@
+from typing import Union
 import requests, os
 
 from loguru import logger
@@ -14,6 +15,7 @@ class AAApiDb(Database):
     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
         super().__init__(config)
+        self.allow_rearchive = bool(self.allow_rearchive)
         self.assert_valid_string("api_endpoint")
         self.assert_valid_string("api_secret")
 
@@ -21,16 +23,37 @@ class AAApiDb(Database):
     def configs() -> dict:
         return {
             "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
-            "api_secret": {"default": None, "help": "API authentication secret"},
+            "api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
+            "api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
             "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
             "author_id": {"default": None, "help": "which email to assign as author"},
             "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
+            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
             "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
         }
 
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """ query the database for the existence of this item"""
+        if self.allow_rearchive: return
+
+        params = {"url": item.get_url(), "limit": 1}
+        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
+        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
 
-    def done(self, item: Metadata) -> None:
+        if response.status_code == 200 and len(response.json()):
+            logger.success(f"API returned a previously archived instance: {response.json()}")
+            # TODO: can we do better than just returning the first result?
+            return Metadata.from_dict(response.json()[0]["result"])
+
+        logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+        return False
+
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
-        logger.info(f"saving archive of {item.get_url()} to the AA API.")
+        if cached:
+            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
+            return
+        logger.debug(f"saving archive of {item.get_url()} to the AA API.")
         payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
         response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
@@ -39,3 +62,5 @@ class AAApiDb(Database):
             logger.success(f"AA API: {response.json()}")
         else:
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+
+
\ No newline at end of file
diff --git a/src/auto_archiver/databases/console_db.py b/src/auto_archiver/databases/console_db.py
index a22bc8e..bd3112d 100644
--- a/src/auto_archiver/databases/console_db.py
+++ b/src/auto_archiver/databases/console_db.py
@@ -27,6 +27,6 @@ class ConsoleDb(Database):
     def aborted(self, item: Metadata) -> None:
         logger.warning(f"ABORTED {item}")
 
-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
         logger.success(f"DONE {item}")
\ No newline at end of file
diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/databases/csv_db.py
index 0743047..f0d7153 100644
--- a/src/auto_archiver/databases/csv_db.py
+++ b/src/auto_archiver/databases/csv_db.py
@@ -24,7 +24,7 @@ class CSVDb(Database):
             "csv_file": {"default": "db.csv", "help": "CSV file name"}
         }
 
-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
         logger.success(f"DONE {item}")
         is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/databases/database.py
index 01b7869..1ea1982 100644
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -36,6 +36,6 @@ class Database(Step, ABC):
         return False
 
     @abstractmethod
-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
         pass
diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py
index b183721..cd36844 100644
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -41,7 +41,7 @@ class GsheetsDb(Database):
         """check if the given item has been archived already"""
         return False
 
-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
         logger.success(f"DONE {item.get_url()}")
         gw, row = self._retrieve_gsheet(item)
@@ -57,8 +57,10 @@ class GsheetsDb(Database):
                 cell_updates.append((row, col, final_value))
             except Exception as e:
                 logger.error(f"Unable to batch {col}={final_value} due to {e}")
-
-        cell_updates.append((row, 'status', item.status))
+        status_message = item.status
+        if cached:
+            status_message = f"[cached] {status_message}"
+        cell_updates.append((row, 'status', status_message))
 
         media: Media = item.get_final_media()
         if hasattr(media, "urls"):
diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py
index ddd8579..65c6acd 100644
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "7"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "2"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
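
Note for anyone maintaining a custom `Database` step outside this repo: `done()` now accepts a `cached` keyword argument, and a database may implement `fetch()` to return a previously archived `Metadata`; when one does, the orchestrator calls `done(result, cached=True)` on every database instead of re-running the archivers. A minimal sketch of a subclass conforming to the new contract — the `ExampleDb` name and its print-based persistence are illustrative only, and the import paths assume this repo's package layout:

```python
# Sketch of the updated Database contract introduced by this diff.
# "ExampleDb" and its storage are hypothetical placeholders.
from auto_archiver.core import Metadata
from auto_archiver.databases import Database


class ExampleDb(Database):
    name = "example_db"

    def fetch(self, item: Metadata):
        # Return a Metadata instance to short-circuit archiving with a cached
        # result; any falsy return lets the archivers run as usual.
        return False

    def done(self, item: Metadata, cached: bool = False) -> None:
        # cached=True means the result came from another database's fetch(),
        # so duplicates can be skipped here (mirroring AAApiDb.done above).
        if cached:
            return
        print(f"saving archive of {item.get_url()}")
```

Since `cached` defaults to `False`, existing subclasses that ignore the new argument keep their previous behaviour.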