kopia lustrzana https://github.com/bellingcat/auto-archiver
metadata.json hardcode in storage. add new metadata_json_enricher. log level change in orchestrator
rodzic
ba3f1a52e8
commit
b3adc5603a
|
@ -541,7 +541,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
yield self.feed_item(item)
|
||||
url_count += 1
|
||||
|
||||
logger.success(f"Processed {url_count} URL(s)")
|
||||
logger.info(f"Processed {url_count} URL(s)")
|
||||
self.cleanup()
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
|
|
|
@ -100,7 +100,12 @@ class Storage(BaseModule):
|
|||
|
||||
# Handle filename_generator logic
|
||||
filename_generator = self.filename_generator
|
||||
if filename_generator == "random":
|
||||
# DM 9th Jun 25 - special case for metadata.json file in metadata_json_enricher
|
||||
# where we want the filename to remain metadata.json
|
||||
# TODO - should this be a config option to keep the original filename? Is it useful anywhere else?
|
||||
if filename.endswith('metadata'):
|
||||
filename = 'metadata'
|
||||
elif filename_generator == "random":
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
from .metadata_json_enricher import MetadataJsonEnricher
|
|
@ -0,0 +1,37 @@
|
|||
# Module manifest for the Metadata JSON Enricher.
{
    "name": "Metadata JSON Enricher",
    "type": ["enricher"],
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru"],
    },
    # This enricher currently declares no configuration options.
    "configs": {},
    "description": """

    Writes all the metadata to a json file so can be parsed by other tools.

    """,
}
|
|
@ -0,0 +1,21 @@
|
|||
import json
|
||||
from loguru import logger
|
||||
import os
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
class MetadataJsonEnricher(Enricher):
    """Enricher that dumps an item's metadata to a ``metadata.json`` file.

    The file is written into ``self.tmp_dir`` and then attached to the item
    as a media entry with the id ``metadata_json``.
    """

    def __init__(self):
        super().__init__()

    def enrich(self, to_enrich: Metadata) -> None:
        """Serialize ``to_enrich`` to JSON on disk and attach the file as media.

        Args:
            to_enrich: The metadata item to serialize. It is mutated: the
                generated JSON file is added to it via ``add_media``.
        """
        url = to_enrich.get_url()

        logger.debug(f"Metadata JSON Enricher for {url=}")

        # NOTE(review): tmp_dir is not assigned in this class; presumably it is
        # provided by the base module - confirm.
        # NOTE(review): the filename is fixed, so a previous metadata.json in the
        # same tmp_dir would be overwritten - confirm tmp_dir is per-item.
        item_path = os.path.join(self.tmp_dir, "metadata.json")
        with open(item_path, mode="w", encoding="utf-8") as outf:
            # default=str: values the json module cannot serialize natively are
            # written as str(value) instead of raising TypeError.
            json.dump(to_enrich.to_dict(), outf, indent=4, default=str)

        to_enrich.add_media(Media(filename=item_path), id="metadata_json")
|
Ładowanie…
Reference in New Issue