Mirror of https://github.com/bellingcat/auto-archiver
Merge branch 'load_modules' into add_module_tests (pull/189/head)

# Conflicts:
#	src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
commit c8cd7ea63c
@@ -15,15 +15,9 @@ from .module import BaseModule
 from typing import Any, List, Type, Tuple

-yaml: YAML = YAML()
+_yaml: YAML = YAML()

-b = yaml.load("""
-# This is a comment
-site.com,site2.com:
-  key: value
-  key2: value2
-""")
-
-EMPTY_CONFIG = yaml.load("""
+EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined

@@ -149,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
     config = None
     try:
         with open(yaml_filename, "r", encoding="utf-8") as inf:
-            config = yaml.load(inf)
+            config = _yaml.load(inf)
     except FileNotFoundError:
         pass

@@ -166,4 +160,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
-        yaml.dump(config_to_save, outf)
+        _yaml.dump(config_to_save, outf)
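The rename to a module-private `_yaml` instance is easier to follow with a round-trip example. A minimal sketch, assuming only `ruamel.yaml` (the `steps`/`feeder` keys are illustrative, not from this branch):

```python
import sys
from ruamel.yaml import YAML

_yaml: YAML = YAML()  # round-trip mode by default: comments and key order survive

doc = _yaml.load("""
# this comment is part of the loaded document
steps:
  feeder: cli_feeder  # inline comments are kept too
""")
doc["steps"]["feeder"] = "gsheet_feeder"
_yaml.dump(doc, sys.stdout)  # prints the edited YAML with both comments intact
```

Sharing one underscored instance across `EMPTY_CONFIG`, `read_yaml`, and `store_yaml` keeps that round-trip behavior consistent and signals that the instance is module-internal.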
@@ -44,6 +44,7 @@ class Metadata:
         if overwrite_left:
             if right.status and len(right.status):
                 self.status = right.status
+            self._context.update(right._context)
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:
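The one added line means a merge now carries over the right-hand side's context alongside its status and metadata. A toy sketch of those semantics with simplified stand-in fields (not the project's full `Metadata` class, whose container handling is richer):

```python
from dataclasses import dataclass, field

@dataclass
class Meta:
    status: str = ""
    metadata: dict = field(default_factory=dict)
    _context: dict = field(default_factory=dict)

    def merge(self, right: "Meta") -> "Meta":
        if right.status:
            self.status = right.status
        self._context.update(right._context)  # the behavior this hunk adds
        for k, v in right.metadata.items():
            self.metadata[k] = v  # right-hand side wins for plain values
        return self
```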
@@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter
 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule

@@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction):
             auth_dict = json.load(f)
         except json.JSONDecodeError:
             # maybe it's yaml, try that
-            auth_dict = yaml.load(f)
+            auth_dict = _yaml.load(f)
         except:
             pass

@@ -424,8 +424,8 @@ class ArchivingOrchestrator:
         cached_result = None
         for d in self.databases:
             d.started(result)
-            if (local_result := d.fetch(result)):
-                cached_result = (cached_result or Metadata()).merge(local_result)
+            if local_result := d.fetch(result):
+                cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
         if cached_result:
             logger.debug("Found previously archived entry")
             for d in self.databases:
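The substantive part of the last hunk is the trailing `.merge(result)`: because later merges overwrite earlier values, the in-flight `result` now takes precedence over whatever the database cache returned. A hedged toy illustration (not the real `Metadata` implementation):

```python
class M:
    """Tiny stand-in whose merge mirrors 'right-hand side wins'."""
    def __init__(self, **kv):
        self.kv = dict(kv)
    def merge(self, other: "M") -> "M":
        self.kv.update(other.kv)
        return self

cached = M(url="https://t.me/example/1", status="archived last year")
live = M(url="https://t.me/example/1", status="requested now")
print(M().merge(cached).merge(live).kv["status"])  # -> "requested now"
```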
@@ -1,7 +1,7 @@
 {
     "name": "Auto-Archiver API Database",
     "type": ["database"],
-    "entry_point": "api_db:AAApiDb",
+    "entry_point": "api_db::AAApiDb",
     "requires_setup": True,
     "dependencies": {
         "python": ["requests", "loguru"],

@@ -23,7 +23,7 @@
             "default": None,
             "help": "which group of users have access to the archive in case public=false as author",
         },
-        "allow_rearchive": {
+        "use_api_cache": {
             "default": True,
             "type": "bool",
             "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",

@@ -43,7 +43,7 @@
 ### Features
 - **API Integration**: Supports querying for existing archives and submitting results.
-- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
+- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
 - **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
 - **Tagging and Metadata**: Adds tags and manages metadata for archives.
 - **Optional Storage**: Archives results conditionally based on configuration.
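The `:` to `::` change in `entry_point` suggests a `<module file>::<class name>` separator (the `atlos_db` manifest below gets the same treatment). A sketch of how a loader might split it; the function, its signature, and the package prefix are assumptions for illustration, not the project's actual loader:

```python
import importlib

def load_entry_point(entry_point: str, package_prefix: str = "auto_archiver.modules"):
    """Split an assumed '<module>::<Class>' entry point and import the class."""
    module_name, class_name = entry_point.split("::")
    module = importlib.import_module(f"{package_prefix}.{module_name}.{module_name}")
    return getattr(module, class_name)

# load_entry_point("api_db::AAApiDb") would resolve the AAApiDb class
```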
@@ -15,11 +15,11 @@ class AAApiDb(Database):
         """ query the database for the existence of this item.
         Helps avoid re-archiving the same URL multiple times.
         """
-        if not self.allow_rearchive: return
+        if not self.use_api_cache: return

         params = {"url": item.get_url(), "limit": 15}
         headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
-        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
+        response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)

         if response.status_code == 200:
             if len(response.json()):

@@ -30,8 +30,7 @@ class AAApiDb(Database):
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
         return False

-
-    def done(self, item: Metadata, cached: bool=False) -> None:
+    def done(self, item: Metadata, cached: bool = False) -> None:
         """archival result ready - should be saved to DB"""
         if not self.store_results: return
         if cached:

@@ -39,12 +38,18 @@ class AAApiDb(Database):
             return
         logger.debug(f"saving archive of {item.get_url()} to the AA API.")

-        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
+        payload = {
+            'author_id': self.author_id,
+            'url': item.get_url(),
+            'public': self.public,
+            'group_id': self.group_id,
+            'tags': list(self.tags),
+            'result': item.to_json(),
+        }
         headers = {"Authorization": f"Bearer {self.api_token}"}
-        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
+        response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)

-        if response.status_code == 200:
+        if response.status_code == 201:
             logger.success(f"AA API: {response.json()}")
         else:
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
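Taken together, the `api_db` hunks move the cache lookup to `url/search`, move submission to `interop/submit-archive`, and expect `201 Created` for submissions. A self-contained sketch of the two calls; the base URL, token, and payload values are placeholders:

```python
import os
import requests

api_endpoint = "https://api.example.com"  # placeholder, not the real endpoint
headers = {"Authorization": "Bearer TOKEN", "accept": "application/json"}

# cache lookup: GET <endpoint>/url/search?url=...&limit=15 -> 200 with a JSON list
search = requests.get(os.path.join(api_endpoint, "url/search"),
                      params={"url": "https://t.me/example/1", "limit": 15},
                      headers=headers)

# submission: POST <endpoint>/interop/submit-archive -> 201 on success
submit = requests.post(os.path.join(api_endpoint, "interop/submit-archive"),
                       json={"url": "https://t.me/example/1", "result": "{}",
                             "public": False, "tags": []},
                       headers=headers)
print(search.status_code, submit.status_code)
```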
@@ -1,7 +1,7 @@
 {
     "name": "Atlos Database",
     "type": ["database"],
-    "entry_point": "atlos_db:AtlosDb",
+    "entry_point": "atlos_db::AtlosDb",
     "requires_setup": True,
     "dependencies":
         {"python": ["loguru",
@@ -109,6 +109,6 @@ class GsheetsDb(Database):
             gw: GWorksheet = gsheet.get("worksheet")
             row: int = gsheet.get("row")
         elif self.sheet_id:
-            print(self.sheet_id)
+            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")

         return gw, row
@@ -13,7 +13,7 @@
 The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
 It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
 and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
-is advised for more comprehensive functionality.
+is advised for more comprehensive functionality, and higher quality media extraction.

 ### Features
 - Extracts images and videos from public Telegram message links (`t.me`).
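To make the `Metadata`/`Media` contract described above concrete, here is a minimal hedged sketch of an extractor returning structured output for a `t.me` link; the import path and helper methods are assumptions based on the description, not verified against this branch:

```python
from auto_archiver.core import Extractor, Metadata, Media  # assumed import path

class MinimalTelegramLikeExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata | bool:
        url = item.get_url()
        if "t.me" not in url:
            return False  # not a Telegram message link: let other extractors try
        result = Metadata().set_url(url)
        # the real module would fetch the message's embedded image/video here
        result.add_media(Media(filename="downloaded_media.jpg"))
        return result.success("telegram")
```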