Get parsing of manifest and combining with config file working

pull/183/head
Patrick Robertson 2025-01-21 20:03:10 +01:00
rodzic 241b35002c
commit 4830f99300
4 zmienionych plików z 121 dodań i 106 usunięć

Wyświetl plik

@ -5,22 +5,9 @@ flexible setup in various environments.
"""
import importlib
import argparse
import yaml
from dataclasses import dataclass, field
from typing import List
from collections import defaultdict
from loguru import logger
from ..archivers import Archiver
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from . import Step
from ..utils import update_nested_dict
# @dataclass
@ -46,84 +33,45 @@ from ..utils import update_nested_dict
# self.cli_ops = {}
# self.config = {}
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
# """
# if yaml_config_filename is provided, the --config argument is ignored,
# useful for library usage when the config values are preloaded
# overwrite_configs is a dict that overwrites the yaml file contents
# """
# # 1. parse CLI values
# if use_cli:
# parser = argparse.ArgumentParser(
# # prog = "auto-archiver",
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
# )
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
# """
# if yaml_config_filename is provided, the --config argument is ignored,
# useful for library usage when the config values are preloaded
# overwrite_configs is a dict that overwrites the yaml file contents
# """
# # 1. parse CLI values
# if use_cli:
# parser = argparse.ArgumentParser(
# # prog = "auto-archiver",
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
# )
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
# # Iterate over all step subclasses to gather default configs and CLI arguments
# for configurable in self.configurable_parents:
# child: Step
# for child in configurable.__subclasses__():
# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
# for config, details in child.configs().items():
# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
# assert "." not in config, f"config property cannot contain dots('.'): {config}"
# config_path = f"{child.name}.{config}"
def format_config(config: dict) -> dict:
    """Flatten a parsed orchestration YAML into argparse-style options.

    Each entry under 'steps' becomes a '--<step>': value pair, and each key of
    each module under 'configurations' becomes '--<module>.<key>': value, so the
    whole config can be fed back through the argument parser.

    Raises ValueError if a configuration key contains a '.', which would be
    ambiguous with the 'module.key' separator.
    """
    new_config = {}
    # steps (feeders, archivers, ...) become top-level '--step' options
    for step, values in config['steps'].items():
        new_config[f"--{step}"] = values
    # per-module configurations become namespaced '--module.key' options
    for name, config_vals in config['configurations'].items():
        for key, value in config_vals.items():
            # raise instead of assert: asserts are stripped under `python -O`
            if "." in key:
                raise ValueError(f"config key cannot contain '.': {key}")
            new_config[f"--{name}.{key}"] = value
    return new_config
# self.defaults[config_path] = details["default"]
# if "cli_set" in details:
# self.cli_ops[config_path] = details["cli_set"]
# if use_cli:
# args = parser.parse_args()
# yaml_config_filename = yaml_config_filename or getattr(args, "config")
# else: args = {}
# # 2. read YAML config file (or use provided value)
# self.yaml_config = self.read_yaml(yaml_config_filename)
# update_nested_dict(self.yaml_config, overwrite_configs)
# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
# self.config = defaultdict(dict)
# for config_path, default in self.defaults.items():
# child, config = tuple(config_path.split("."))
# val = getattr(args, config_path, None)
# if val is not None and config_path in self.cli_ops:
# val = self.cli_ops[config_path](val, default)
# if val is None:
# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
# self.config[child][config] = val
# self.config = dict(self.config)
# # 4. STEPS: read steps and validate they exist
# steps = self.yaml_config.get("steps", {})
# assert "archivers" in steps, "your configuration steps are missing the archivers property"
# assert "storages" in steps, "your configuration steps are missing the storages property"
# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
# logger.info(f"FEEDER: {self.feeder.name}")
# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
# logger.info(f"DATABASES: {[x.name for x in self.databases]}")
# logger.info(f"STORAGES: {[x.name for x in self.storages]}")
# logger.info(f"FORMATTER: {self.formatter.name}")
class LoadFromFile (argparse.Action):
    """Custom argparse action: the option's value is an open file whose
    contents are further command-line arguments to be parsed into the
    same namespace."""

    def __call__(self, parser, namespace, values, option_string=None):
        # `values` is a file-like object; drain it first, then feed its
        # whitespace-separated tokens back through the parser
        with values as arg_file:
            tokens = arg_file.read().split()
        parser.parse_args(tokens, namespace)
def read_yaml(yaml_filename: str) -> dict:
    """Read a YAML configuration file and return it flattened into
    argparse-style '--module.key' options via format_config.

    Raises FileNotFoundError if the file does not exist (callers handle this
    to fall back to default settings).
    """
    with open(yaml_filename, "r", encoding="utf-8") as inf:
        # safe_load: never construct arbitrary objects from the config file.
        # NOTE: the flattened diff left a dead `return yaml.safe_load(inf)`
        # before this line, which made the format_config call unreachable.
        return format_config(yaml.safe_load(inf))

Wyświetl plik

@ -1,4 +1,6 @@
import ast
import os
import copy
from os.path import join, dirname
from typing import List
@ -11,15 +13,18 @@ _DEFAULT_MANIFEST = {
'external_dependencies': {},
'entry_point': '',
'version': '1.0',
'config': {}
}
def load_manifest(module):
    """Read a module's manifest file and overlay it on the default manifest.

    The manifest is a Python-literal dict parsed with ast.literal_eval, which
    evaluates literals only — no code execution.
    """
    # deep-copy so the shared _DEFAULT_MANIFEST is never mutated by update()
    manifest = copy.deepcopy(_DEFAULT_MANIFEST)
    with open(join(module, MANIFEST_FILE)) as f:
        manifest.update(ast.literal_eval(f.read()))
    return manifest
def available_modules(additional_paths: List[str] = None, with_manifest: bool = False) -> List[tuple]:
    """Discover valid modules on disk.

    Searches the default 'modules' path plus any additional_paths and returns
    a list of (module_name, module_path, manifest) tuples. The manifest dict
    is only populated when with_manifest is True, to avoid parsing every
    manifest file when the caller just needs the names/paths.

    additional_paths defaults to None (not a mutable []) and is treated as an
    empty list.
    """
    # NOTE(review): the computation of `default_path` and the `all_modules`
    # initialisation were elided in this view of the file; this assumes a
    # 'modules' directory beside this file — confirm against the original.
    # search through all valid 'modules' paths. Default is 'modules' in the current directory
    # see odoo/modules/module.py -> get_modules
    default_path = [join(dirname(__file__), "modules")]
    all_modules = []
    for module_folder in default_path + (additional_paths or []):
        # walk through each entry in module_folder and keep those with a valid manifest
        for possible_module in os.listdir(module_folder):
            possible_module_path = join(module_folder, possible_module)
            if not is_really_module(possible_module_path):
                continue
            # parse manifest only when requested and add to the list
            manifest = load_manifest(possible_module_path) if with_manifest else {}
            all_modules.append((possible_module, possible_module_path, manifest))
    return all_modules

Wyświetl plik

@ -5,9 +5,6 @@
"""
from __future__ import annotations
import ast
import os
from os.path import dirname, join
from typing import Generator, Union, List
from urllib.parse import urlparse
from ipaddress import ip_address
@ -51,23 +48,67 @@ class ArchivingOrchestrator:
def setup_parser(self):
    """Build the top-level argument parser and store it on self.parser.

    The built-in help is disabled so that module config options can be
    registered before help is printed (see show_help)."""
    base = argparse.ArgumentParser(
        # prog = "auto-archiver",
        add_help=False,
        description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
        epilog="Check the code at https://github.com/bellingcat/auto-archiver",
    )
    base.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
    base.add_argument('--version', action='version', version=__version__)
    base.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
    # override the default 'help' so we can inject all the configs and show those
    base.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
    self.parser = base
def add_module_args(self, modules: list = None):
    """Register every module's manifest config options as CLI arguments.

    Each option becomes '--<module>.<name>' with dest '<module>.<dest-or-name>'.
    When modules is None/empty, all available modules (with manifests) are used.
    """
    if not modules:
        modules = available_modules(with_manifest=True)
    for module_name, module_path, manifest in modules:
        for name, kwargs in manifest['config'].items():
            # copy before mutating: popping 'dest' from the manifest dict in
            # place would double-prefix it on a second call with the same
            # manifest (e.g. show_help after setup_config)
            arg_kwargs = dict(kwargs)
            arg_kwargs['dest'] = f"{module_name}.{arg_kwargs.pop('dest', name)}"
            self.parser.add_argument(f"--{module_name}.{name}", **arg_kwargs)
def show_help(self):
    """Print the complete help text — covering every module's config
    options — then exit the process."""
    # help should describe *all* possible modules, so register each one's
    # config options on the parser before printing
    self.add_module_args()
    self.parser.print_help()
    exit()
def setup_config(self, config: dict) -> None:
    """Merge the YAML config with CLI arguments and finish parsing.

    In 'simple' mode only modules whose manifest has requires_setup == False
    get their config options registered; 'full' mode relies on the config
    file / CLI to enumerate modules explicitly.
    """
    if self.config.mode == 'simple':
        # only expose config args for modules that work without manual setup
        simple_modules = [module for module in available_modules(with_manifest=True)
                          if not module[2]['requires_setup']]
        self.add_module_args(simple_modules)

    # the step-selection options are mandatory
    self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use')
    self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use')
    self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the extractors to use')
    self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use')
    self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use')
    self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use')

    # overlay the first-pass CLI values on the YAML config, then re-parse now
    # that the module/step arguments exist.
    # NOTE(review): parse_known_args expects a list of argument strings, not a
    # dict — confirm how `config` is meant to be converted before parsing.
    config.update(self.config.__dict__)
    self.config, unknown = self.parser.parse_known_args(config)
    if unknown:
        logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
    # (removed a leftover breakpoint() debug statement here)

    logger.info(f"FEEDER: {self.config.feeders}")
    logger.info(f"ENRICHERS: {self.config.enrichers}")
    # fixed: previous code read self.config.archivers, but the dest is 'extractors'
    logger.info(f"EXTRACTORS: {self.config.extractors}")
    logger.info(f"DATABASES: {self.config.databases}")
    logger.info(f"STORAGES: {self.config.storages}")
    # fixed: self.formatter is never assigned in this flow; log the parsed value
    logger.info(f"FORMATTER: {self.config.formatter}")
def run(self) -> None:
    """Entry point: build the parser, load the config file, and finish setup."""
    self.setup_parser()

    # NOTE(review): some lines between setup_parser() and the parse below were
    # not visible in this view of the file — confirm nothing else belongs here.

    # first pass parses only the base args, to discover --config/--mode/--help
    # before module-specific arguments exist
    self.config, _ = self.parser.parse_known_args()

    # if the help flag was passed, show the full (module-aware) help and exit
    if self.config.help:
        self.show_help()

    # load the config file to get the list of enabled items
    config = {}
    try:
        config = read_yaml(self.config.config_file)
    except FileNotFoundError:
        # a missing *default* config file is fine (fresh setup with defaults);
        # a missing user-specified file is a hard error
        if self.config.config_file != DEFAULT_CONFIG_FILE:
            logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
            exit()

    self.setup_config(config)
    # (removed a leftover breakpoint() and a stale `config.parse()` call —
    # `config` is a plain dict here; .parse() belonged to the old Config class)

Wyświetl plik

@ -29,5 +29,18 @@ the broader archiving framework.
metadata objects. Some dropins are included in this generic_archiver by default, but
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
"""
""",
'config': {
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
"subtitles": {"default": True, "help": "download subtitles if available"},
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
"livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
}