From a786d4bb0e72df3a46abe2fea66c5c0513c6bbfe Mon Sep 17 00:00:00 2001 From: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 13 Dec 2023 11:26:46 +0000 Subject: [PATCH] chooses most complete result from api (#116) --- src/auto_archiver/core/metadata.py | 13 +++++++++++++ src/auto_archiver/databases/api_db.py | 8 ++++---- src/auto_archiver/version.py | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 8ffcb03..3b73dcd 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -165,3 +165,16 @@ class Metadata: def __str__(self) -> str: return self.__repr__() + + + @staticmethod + def choose_most_complete(results: List[Metadata]) -> Metadata: + # returns the most complete result from a list of results + # prioritizes results with more media, then more metadata + if len(results) == 0: return None + if len(results) == 1: return results[0] + most_complete = results[0] + for r in results[1:]: + if len(r.media) > len(most_complete.media): most_complete = r + elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r + return most_complete \ No newline at end of file diff --git a/src/auto_archiver/databases/api_db.py b/src/auto_archiver/databases/api_db.py index b33b146..92ae6bf 100644 --- a/src/auto_archiver/databases/api_db.py +++ b/src/auto_archiver/databases/api_db.py @@ -35,15 +35,15 @@ class AAApiDb(Database): """ query the database for the existence of this item""" if not self.allow_rearchive: return - params = {"url": item.get_url(), "limit": 1} + params = {"url": item.get_url(), "limit": 15} headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) if response.status_code == 200: if len(response.json()): - logger.success(f"API returned a previously archived instance: {response.json()}") - # TODO: can we do better than just returning the most recent result? - return Metadata.from_dict(response.json()[0]["result"]) + logger.success(f"API returned {len(response.json())} previously archived instance(s)") + fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()] + return Metadata.choose_most_complete(fetched_metadata) else: logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") return False diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py index 828e019..1411185 100644 --- a/src/auto_archiver/version.py +++ b/src/auto_archiver/version.py @@ -3,7 +3,7 @@ _MAJOR = "0" _MINOR = "7" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "4" +_PATCH = "5" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""