kopia lustrzana https://github.com/bellingcat/auto-archiver
Get parsing of manifest and combining with config file working
rodzic
241b35002c
commit
4830f99300
|
@ -5,22 +5,9 @@ flexible setup in various environments.
|
|||
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import argparse
|
||||
import yaml
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
from collections import defaultdict
|
||||
from loguru import logger
|
||||
|
||||
from ..archivers import Archiver
|
||||
from ..feeders import Feeder
|
||||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
from ..utils import update_nested_dict
|
||||
|
||||
|
||||
# @dataclass
|
||||
|
@ -46,84 +33,45 @@ from ..utils import update_nested_dict
|
|||
# self.cli_ops = {}
|
||||
# self.config = {}
|
||||
|
||||
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
# """
|
||||
# if yaml_config_filename is provided, the --config argument is ignored,
|
||||
# useful for library usage when the config values are preloaded
|
||||
# overwrite_configs is a dict that overwrites the yaml file contents
|
||||
# """
|
||||
# # 1. parse CLI values
|
||||
# if use_cli:
|
||||
# parser = argparse.ArgumentParser(
|
||||
# # prog = "auto-archiver",
|
||||
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
# )
|
||||
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
# """
|
||||
# if yaml_config_filename is provided, the --config argument is ignored,
|
||||
# useful for library usage when the config values are preloaded
|
||||
# overwrite_configs is a dict that overwrites the yaml file contents
|
||||
# """
|
||||
# # 1. parse CLI values
|
||||
# if use_cli:
|
||||
# parser = argparse.ArgumentParser(
|
||||
# # prog = "auto-archiver",
|
||||
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
# )
|
||||
|
||||
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
# parser.add_argument('--version', action='version', version=__version__)
|
||||
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
# parser.add_argument('--version', action='version', version=__version__)
|
||||
|
||||
# # Iterate over all step subclasses to gather default configs and CLI arguments
|
||||
# for configurable in self.configurable_parents:
|
||||
# child: Step
|
||||
# for child in configurable.__subclasses__():
|
||||
# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
|
||||
# for config, details in child.configs().items():
|
||||
# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
# assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
# config_path = f"{child.name}.{config}"
|
||||
def format_config(config: dict) -> dict:
    """Flatten a parsed config into a mapping keyed by CLI-style option names.

    Every entry of ``config['steps']`` becomes ``--<step>`` and every entry of
    ``config['configurations'][<name>]`` becomes ``--<name>.<key>``; the values
    are carried over unchanged.

    Args:
        config: The parsed configuration. ``None`` (e.g. the result of loading
            an empty YAML document) and missing or empty ``steps`` /
            ``configurations`` sections are treated as empty.

    Returns:
        A new flat dict; ``config`` itself is not modified.

    Raises:
        AssertionError: If a configuration key contains a ``.``, which would
            make the ``<name>.<key>`` path ambiguous.
    """
    config = config or {}
    new_config = {}

    for step, values in (config.get('steps') or {}).items():
        new_config[f"--{step}"] = values

    # format configurations
    for name, config_vals in (config.get('configurations') or {}).items():
        for key, value in (config_vals or {}).items():
            if "." in key:
                # Raised explicitly instead of via `assert` so the check is
                # not stripped when Python runs with -O.
                raise AssertionError("config key cannot contain '.'")
            new_config[f"--{name}.{key}"] = value

    return new_config
|
||||
|
||||
# self.defaults[config_path] = details["default"]
|
||||
# if "cli_set" in details:
|
||||
# self.cli_ops[config_path] = details["cli_set"]
|
||||
|
||||
# if use_cli:
|
||||
# args = parser.parse_args()
|
||||
# yaml_config_filename = yaml_config_filename or getattr(args, "config")
|
||||
# else: args = {}
|
||||
|
||||
# # 2. read YAML config file (or use provided value)
|
||||
# self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
# update_nested_dict(self.yaml_config, overwrite_configs)
|
||||
|
||||
# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
# self.config = defaultdict(dict)
|
||||
# for config_path, default in self.defaults.items():
|
||||
# child, config = tuple(config_path.split("."))
|
||||
# val = getattr(args, config_path, None)
|
||||
# if val is not None and config_path in self.cli_ops:
|
||||
# val = self.cli_ops[config_path](val, default)
|
||||
# if val is None:
|
||||
# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
|
||||
# self.config[child][config] = val
|
||||
# self.config = dict(self.config)
|
||||
|
||||
# # 4. STEPS: read steps and validate they exist
|
||||
# steps = self.yaml_config.get("steps", {})
|
||||
# assert "archivers" in steps, "your configuration steps are missing the archivers property"
|
||||
# assert "storages" in steps, "your configuration steps are missing the storages property"
|
||||
|
||||
# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
|
||||
# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
|
||||
# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
|
||||
# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
|
||||
# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
|
||||
# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
|
||||
|
||||
# logger.info(f"FEEDER: {self.feeder.name}")
|
||||
# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
|
||||
# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
|
||||
# logger.info(f"DATABASES: {[x.name for x in self.databases]}")
|
||||
# logger.info(f"STORAGES: {[x.name for x in self.storages]}")
|
||||
# logger.info(f"FORMATTER: {self.formatter.name}")
|
||||
class LoadFromFile(argparse.Action):
    """Argparse action that reads further command-line arguments from a file.

    ``values`` is used as a context manager and read in full, so it is
    expected to be an already opened, readable text file object.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        with values as handle:
            file_args = handle.read().split()
        # parse the whitespace-separated arguments found in the file and
        # store them in the target namespace
        parser.parse_args(file_args, namespace)
|
||||
|
||||
def read_yaml(yaml_filename: str) -> dict:
    """Load a YAML configuration file and return it in flattened form.

    Args:
        yaml_filename: Path of the YAML file; it is read as UTF-8.

    Returns:
        The result of passing the parsed document through ``format_config``.

    Raises:
        FileNotFoundError: If ``yaml_filename`` does not exist.
    """
    with open(yaml_filename, "r", encoding="utf-8") as inf:
        # A single return: a preceding bare `return yaml.safe_load(inf)` made
        # the format_config() call unreachable.
        return format_config(yaml.safe_load(inf))
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import ast
|
||||
import os
|
||||
import copy
|
||||
from os.path import join, dirname
|
||||
from typing import List
|
||||
|
||||
|
@ -11,15 +13,18 @@ _DEFAULT_MANIFEST = {
|
|||
'external_dependencies': {},
|
||||
'entry_point': '',
|
||||
'version': '1.0',
|
||||
'config': {}
|
||||
}
|
||||
|
||||
def load_manifest(module):
    """Read a module's manifest file and overlay it on the default manifest.

    Args:
        module: Path of the module directory that contains ``MANIFEST_FILE``.

    Returns:
        A deep copy of ``_DEFAULT_MANIFEST`` updated with the top-level keys
        found in the module's manifest.

    Raises:
        OSError: If the manifest file cannot be opened.
        ValueError, SyntaxError: If the file content is not a valid Python
            literal.
    """
    # start from a private copy so the shared defaults are never mutated
    manifest = copy.deepcopy(_DEFAULT_MANIFEST)

    with open(join(module, MANIFEST_FILE), encoding="utf-8") as f:
        # literal_eval only accepts Python literals, so the manifest is
        # parsed without executing any code it might contain
        manifest.update(ast.literal_eval(f.read()))
    return manifest
|
||||
|
||||
def available_modules(self, additional_paths: List[str] = []) -> List[dict]:
|
||||
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]:
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
|
@ -32,11 +37,15 @@ def available_modules(self, additional_paths: List[str] = []) -> List[dict]:
|
|||
|
||||
for module_folder in default_path + additional_paths:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
for folder in os.listdir(module_folder):
|
||||
possible_module = join(module_folder, folder)
|
||||
if not is_really_module(possible_module):
|
||||
for possible_module in os.listdir(module_folder):
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
continue
|
||||
# parse manifest and add to list of available modules
|
||||
all_modules.append(possible_module)
|
||||
if with_manifest:
|
||||
manifest = load_manifest(possible_module_path)
|
||||
else:
|
||||
manifest = {}
|
||||
all_modules.append((possible_module, possible_module_path, manifest))
|
||||
|
||||
return all_modules
|
|
@ -5,9 +5,6 @@
|
|||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import ast
|
||||
import os
|
||||
from os.path import dirname, join
|
||||
from typing import Generator, Union, List
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
|
@ -51,23 +48,67 @@ class ArchivingOrchestrator:
|
|||
def setup_parser(self):
    """Build the bootstrap argument parser and store it on ``self.parser``.

    Only the basic options (``--config``, ``--version``, ``--mode`` and
    ``-h/--help``) are registered here. The built-in help action is disabled
    and replaced by a plain flag so that the configs can be injected before
    the help text is shown.
    """
    cli = argparse.ArgumentParser(
        # prog = "auto-archiver",
        add_help=False,
        description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
        epilog="Check the code at https://github.com/bellingcat/auto-archiver",
    )
    cli.add_argument(
        '--config',
        dest='config_file',
        default=DEFAULT_CONFIG_FILE,
        help='the filename of the YAML configuration file (defaults to \'config.yaml\')',
    )
    cli.add_argument('--version', action='version', version=__version__)
    cli.add_argument(
        '--mode',
        choices=['simple', 'full'],
        default='simple',
        help='the mode to run the archiver in',
    )
    # override the default 'help' so we can inject all the configs and show those
    cli.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
    self.parser = cli
|
||||
|
||||
def setup_config(self):
|
||||
def add_module_args(self, modules: list = None):
    """Register the config options declared in module manifests on ``self.parser``.

    Option ``name`` of module ``module_name`` is exposed as
    ``--<module_name>.<name>`` and stored under the dest
    ``<module_name>.<dest>``, where ``<dest>`` is the manifest entry's own
    ``dest`` if it has one and ``name`` otherwise. All remaining keys of the
    manifest entry are forwarded to ``add_argument`` as keyword arguments.

    Args:
        modules: ``(module_name, module_path, manifest)`` tuples. When
            ``None``, all available modules are loaded together with their
            manifests. An explicitly empty list registers nothing.
    """
    if modules is None:
        modules = available_modules(with_manifest=True)

    for module_name, _module_path, manifest in modules:
        for name, kwargs in (manifest.get('config') or {}).items():
            # Work on a copy: rewriting 'dest' in the manifest's own dict
            # would compound the module prefix if the same manifest were
            # registered a second time.
            arg_kwargs = dict(kwargs)
            arg_kwargs['dest'] = f"{module_name}.{arg_kwargs.pop('dest', name)}"
            self.parser.add_argument(f"--{module_name}.{name}", **arg_kwargs)
|
||||
|
||||
def show_help(self):
    """Print the help text including every module's options, then exit.

    Raises:
        SystemExit: Always, once the help text has been printed.
    """
    # for the help message, we want to load *all* possible modules and show the help
    # add configs as arg parser arguments
    self.add_module_args()

    self.parser.print_help()
    # Use the parser's own exit instead of the `exit` builtin, which is
    # injected by the `site` module and is not guaranteed to exist.
    self.parser.exit()
|
||||
|
||||
def setup_config(self, config: dict) -> None:
    """Register module and step arguments, then re-parse using ``config``.

    Args:
        config: Configuration values; updated in place with the attributes of
            the namespace currently stored in ``self.config`` before being
            handed to the parser.

    Side effects:
        Adds arguments to ``self.parser`` and replaces ``self.config`` with
        the namespace returned by the second parsing pass.
    """
    # check what mode we're in
    # if simple, we'll load just the modules that has requires_setup = False
    # if full, we'll load all modules
    if self.config.mode == 'simple':
        simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']]
        self.add_module_args(simple_modules)
    # NOTE(review): nothing is registered for mode 'full' yet, although the
    # comment above says all modules should be loaded - confirm intent.

    # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required"
    self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use')
    self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use')
    self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the extractors to use')
    self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use')
    self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use')
    self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use')

    config.update(self.config.__dict__)
    # reload the parser with the new arguments, now that we have them
    # NOTE(review): parse_known_args expects a sequence of argument strings;
    # given a dict it only iterates the keys, so the values in `config` are
    # not seen by the parser - confirm the intended conversion.
    self.config, unknown = self.parser.parse_known_args(config)
    if unknown:
        logger.warning(f"Ignoring unknown/unused arguments: {unknown}")

    logger.info(f"FEEDER: {self.config.feeders}")
    logger.info(f"ENRICHERS: {self.config.enrichers}")
    # the parser defines --extractors (there is no 'archivers' dest)
    logger.info(f"ARCHIVERS: {self.config.extractors}")
    logger.info(f"DATABASES: {self.config.databases}")
    logger.info(f"STORAGES: {self.config.storages}")
    # read the formatter from the parsed namespace like the other steps
    logger.info(f"FORMATTER: {self.config.formatter}")
|
||||
|
||||
|
||||
|
||||
def run(self) -> None:
|
||||
self.setup_parser()
|
||||
|
@ -77,17 +118,21 @@ class ArchivingOrchestrator:
|
|||
# load the config file to get the list of enabled items
|
||||
self.config, _ = self.parser.parse_known_args()
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if self.config.help:
|
||||
self.show_help()
|
||||
# load the config file
|
||||
config = {}
|
||||
|
||||
try:
|
||||
config = read_yaml(self.config.config_file)
|
||||
except FileNotFoundError:
|
||||
if self.settings.config == DEFAULT_CONFIG_FILE:
|
||||
# no config file found, let's do the setup with the default values
|
||||
self.setup_config()
|
||||
else:
|
||||
if self.config.config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
|
||||
self.setup_config(config)
|
||||
|
||||
breakpoint()
|
||||
config.parse()
|
||||
|
||||
|
|
|
@ -29,5 +29,18 @@ the broader archiving framework.
|
|||
metadata objects. Some dropins are included in this generic_archiver by default, but
|
||||
custom dropins can be created to handle additional websites and passed to the archiver
|
||||
via the command line using the `--dropins` option (TODO!).
|
||||
"""
|
||||
""",
|
||||
'config': {
|
||||
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
|
||||
"subtitles": {"default": True, "help": "download subtitles if available"},
|
||||
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
|
||||
"livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
|
||||
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
|
||||
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
|
||||
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
|
||||
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
|
||||
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
|
||||
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
|
||||
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
|
||||
}
|
||||
}
|
Ładowanie…
Reference in New Issue