enables api_db cache queries if configured with new option (#113)

2023-12-12 19:20:26 +00:00 · 2023-12-12 19:20:26 +00:00 · 6f36e92e02
commit 6f36e92e02
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@ -105,7 +105,8 @@ class Metadata:

    def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
        ts = self.get("timestamp")
-        if not ts: return ts
+        if not ts: return None
+        if isinstance(ts, (int, float)): ts = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)  # epoch seconds -> aware UTC, independent of the host's local timezone
        if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
        if iso: return ts.isoformat()
        return ts
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@ -77,7 +77,7 @@ class ArchivingOrchestrator:
        if cached_result:
            logger.debug("Found previously archived entry")
            for d in self.databases:
-                d.done(cached_result)
+                d.done(cached_result, cached=True)
            return cached_result

        # 3 - call archivers until one succeeds
--- a/src/auto_archiver/databases/api_db.py
+++ b/src/auto_archiver/databases/api_db.py
@ -1,3 +1,4 @@
+from typing import Union
 import requests, os
 from loguru import logger

@ -14,6 +15,7 @@ class AAApiDb(Database):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
+        self.allow_rearchive = bool(self.allow_rearchive)
        self.assert_valid_string("api_endpoint")
        self.assert_valid_string("api_secret")

@ -21,16 +23,37 @@ class AAApiDb(Database):
    def configs() -> dict:
        return {
            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
-            "api_secret": {"default": None, "help": "API authentication secret"},
+            "api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
+            "api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
            "author_id": {"default": None, "help": "which email to assign as author"},
            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
+            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
        }
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """ query the API for a previous archive of this item's URL; returns its Metadata, or False when re-archiving is allowed, nothing was found, or the call failed"""
+        if self.allow_rearchive: return False  # the API cache is only consulted when re-archiving is disallowed
+        params = {"url": item.get_url(), "limit": 1}
+        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
+        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)

-    def done(self, item: Metadata) -> None:
+        if response.status_code != 200:
+            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+            return False
+        results = response.json()
+        if not results: return False  # 200 with an empty list: this URL has not been archived yet
+        logger.success(f"API returned a previously archived instance: {results}")
+        # TODO: can we do better than just returning the first result?
+        return Metadata.from_dict(results[0]["result"])
+
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
-        logger.info(f"saving archive of {item.get_url()} to the AA API.")
+        if cached: 
+            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
+            return
+        logger.debug(f"saving archive of {item.get_url()} to the AA API.")

        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
@ -39,3 +62,5 @@ class AAApiDb(Database):
            logger.success(f"AA API: {response.json()}")
        else:
            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+
+    
--- a/src/auto_archiver/databases/console_db.py
+++ b/src/auto_archiver/databases/console_db.py
@ -27,6 +27,6 @@ class ConsoleDb(Database):
    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")

-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item}")
--- a/src/auto_archiver/databases/csv_db.py
+++ b/src/auto_archiver/databases/csv_db.py
@ -24,7 +24,7 @@ class CSVDb(Database):
            "csv_file": {"default": "db.csv", "help": "CSV file name"}
        }

-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item}")
        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@ -36,6 +36,6 @@ class Database(Step, ABC):
        return False

    @abstractmethod
-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
        pass
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@ -41,7 +41,7 @@ class GsheetsDb(Database):
        """check if the given item has been archived already"""
        return False

-    def done(self, item: Metadata) -> None:
+    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)
@ -57,8 +57,10 @@ class GsheetsDb(Database):
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")
-
-        cell_updates.append((row, 'status', item.status))
+        status_message = item.status
+        if cached:
+            status_message = f"[cached] {status_message}"
+        cell_updates.append((row, 'status', status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "7"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "2"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""