Hardcode the metadata.json filename in storage, add a new metadata_json_enricher, and change a log level in the orchestrator.

pull/320/head
Dave Mateer 2025-06-17 09:51:19 +01:00
rodzic ba3f1a52e8
commit b3adc5603a
5 zmienionych plików z 66 dodań i 2 usunięć

Wyświetl plik

@ -541,7 +541,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
yield self.feed_item(item)
url_count += 1
logger.success(f"Processed {url_count} URL(s)")
logger.info(f"Processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:

Wyświetl plik

@ -100,7 +100,12 @@ class Storage(BaseModule):
# Handle filename_generator logic
filename_generator = self.filename_generator
if filename_generator == "random":
# DM 9th Jun 25 - special case for metadata.json file in metadata_json_enricher
# where we want the filename to remain metadata.json
# TODO - should this be a config option to keep the original filename? Is it useful anywhere else?
if filename.endswith('metadata'):
filename = 'metadata'
elif filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module

Wyświetl plik

@ -0,0 +1 @@
from .metadata_json_enricher import MetadataJsonEnricher

Wyświetl plik

@ -0,0 +1,37 @@
# Module manifest for the Metadata JSON Enricher: declares the module's display
# name, type, Python dependencies, configuration options and description.
{
    "name": "Metadata JSON Enricher",
    "type": ["enricher"],
    # NOTE(review): confirm whether this enricher actually needs any setup step;
    # nothing in the manifest (no configs, only loguru) suggests that it does.
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru"],
    },
    "configs": {
        # This enricher currently exposes no configuration options.
    },
    "description": """
Writes all the metadata to a json file so can be parsed by other tools.
""",
}

Wyświetl plik

@ -0,0 +1,21 @@
import json
from loguru import logger
import os
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata
class MetadataJsonEnricher(Enricher):
    """Enricher that dumps an item's metadata to a ``metadata.json`` file.

    The file is written into the module's temporary directory (``self.tmp_dir``)
    and attached to the item as a media entry with id ``metadata_json``.

    No ``__init__`` override is defined: the previous one only forwarded to
    ``super().__init__()``, which is what inheriting the constructor already does.
    """

    def enrich(self, to_enrich: Metadata) -> None:
        """Serialize ``to_enrich`` to JSON on disk and attach the file as media.

        Args:
            to_enrich: The metadata item to serialize. It is mutated: the
                generated JSON file is added to it as a media entry.
        """
        url = to_enrich.get_url()
        logger.debug(f"Metadata JSON Enricher for {url=}")

        # The file name is constant, so a plain string is used (not an f-string).
        item_path = os.path.join(self.tmp_dir, "metadata.json")
        with open(item_path, mode="w", encoding="utf-8") as outf:
            # default=str: any value json cannot encode natively is written
            # as str(value) instead of raising TypeError.
            json.dump(to_enrich.to_dict(), outf, indent=4, default=str)

        to_enrich.add_media(Media(filename=item_path), id="metadata_json")