Hardcode the metadata.json filename in storage; add new metadata_json_enricher; change log level in orchestrator

2025-06-17 09:51:19 +01:00 · 2025-06-17 09:51:19 +01:00 · b3adc5603a
commit b3adc5603a
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -541,7 +541,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                yield self.feed_item(item)
                url_count += 1

-        logger.success(f"Processed {url_count} URL(s)")
+        logger.info(f"Processed {url_count} URL(s)")
        self.cleanup()

    def feed_item(self, item: Metadata) -> Metadata:
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -100,7 +100,12 @@ class Storage(BaseModule):

        # Handle filename_generator logic
        filename_generator = self.filename_generator
-        if filename_generator == "random":
+        # DM 9th Jun 25 - special case for metadata.json file in metadata_json_enricher
+        # where we want the filename to remain metadata.json 
+        # TODO - should this be a config option to keep the original filename? Is it useful anywhere else?
+        if filename.endswith('metadata'):
+            filename = 'metadata'
+        elif filename_generator == "random":
            filename = random_str(24)
        elif filename_generator == "static":
            # load the hash_enricher module
--- a/src/auto_archiver/modules/metadata_json_enricher/__init__.py
+++ b/src/auto_archiver/modules/metadata_json_enricher/__init__.py
@@ -0,0 +1 @@
+from .metadata_json_enricher import MetadataJsonEnricher
--- a/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py
@@ -0,0 +1,37 @@
+{
+    "name": "Metadata JSON Enricher",
+    "type": ["enricher"],
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru"],
+    },
+    "configs": {
+        # "width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
+        # "height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
+        # "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
+        # "sleep_before_screenshot": {
+        #     "default": 4,
+        #     "type": "int",
+        #     "help": "seconds to wait for the pages to load before taking screenshot",
+        # },
+        # "http_proxy": {
+        #     "default": "",
+        #     "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
+        # },
+        # "save_to_pdf": {
+        #     "default": False,
+        #     "type": "bool",
+        #     "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
+        # },
+        # "print_options": {
+        #     "default": {},
+        #     "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
+        #     "type": "json_loader",
+        # },
+    },
+    "description": """
+
+    Writes all the metadata to a json file so can be parsed by other tools.
+
+    """,
+}
--- a/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py
+++ b/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py
@@ -0,0 +1,21 @@
+import json
+from loguru import logger
+import os
+
+from auto_archiver.core import Enricher
+from auto_archiver.core import Media, Metadata
+
+class MetadataJsonEnricher(Enricher):
+    def __init__(self):
+        super().__init__()
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+
+        logger.debug(f"Metadata JSON Enricher for {url=}")
+
+        item_path = os.path.join(self.tmp_dir, f"metadata.json")
+        with open(item_path, mode="w", encoding="utf-8") as outf:
+            json.dump(to_enrich.to_dict(), outf, indent=4, default=str)
+        
+        to_enrich.add_media(Media(filename=item_path), id="metadata_json")
				`@ -0,0 +1 @@`
				`from .metadata_json_enricher import MetadataJsonEnricher`