Further tweaks based on __manifest__.py files

Loading configs now works
pull/183/head
Patrick Robertson 2025-01-22 13:11:43 +01:00
rodzic 7b3a1468cd
commit 54995ad6ab
7 zmienionych plików z 214 dodań i 86 usunięć

43
poetry.lock wygenerowano
Wyświetl plik

@ -1043,7 +1043,7 @@ version = "3.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
optional = false
python-versions = ">=3.8"
groups = ["docs"]
groups = ["main", "docs"]
files = [
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
@ -1179,7 +1179,7 @@ version = "0.1.2"
description = "Markdown URL utilities"
optional = false
python-versions = ">=3.7"
groups = ["docs"]
groups = ["main", "docs"]
files = [
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
@ -1654,7 +1654,7 @@ version = "2.19.1"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
python-versions = ">=3.8"
groups = ["docs"]
groups = ["main", "docs"]
files = [
{file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
{file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
@ -2031,6 +2031,41 @@ files = [
[package.dependencies]
six = ">=1.7.0"
[[package]]
name = "rich"
version = "13.9.4"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.8.0"
groups = ["main"]
files = [
{file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"},
{file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"},
]
[package.dependencies]
markdown-it-py = ">=2.2.0"
pygments = ">=2.13.0,<3.0.0"
typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""}
[package.extras]
jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "rich-argparse"
version = "1.6.0"
description = "Rich help formatters for argparse and optparse"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"},
{file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"},
]
[package.dependencies]
rich = ">=11.0.0"
[[package]]
name = "rsa"
version = "4.9"
@ -2966,4 +3001,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "462c7c5f9d1fbae895d6299ba0b690b6e24d0655a4c9fc79f75ddef4eec222f8"
content-hash = "911543169cbd6c68ab3392a052ea58917539acdfbc6511e591f8a2b497443cdc"

Wyświetl plik

@ -56,6 +56,7 @@ dependencies = [
"retrying (>=0.0.0)",
"tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)",
"rich-argparse (>=1.6.0,<2.0.0)",
]
[tool.poetry.group.dev.dependencies]

Wyświetl plik

@ -4,14 +4,11 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow
flexible setup in various environments.
"""
import argparse
import yaml
from configparser import ConfigParser
from dataclasses import dataclass, field
# @dataclass
# class Config:
# configurable_parents = [
# Feeder,
# Enricher,
@ -50,21 +47,6 @@ from dataclasses import dataclass, field
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
def format_config(config: dict) -> dict:
    """Flatten a parsed configuration mapping into CLI-style option names.

    Every entry of ``config['steps']`` is emitted as ``--<step>`` and every
    entry of ``config['configurations'][<name>]`` is emitted as
    ``--<name>.<key>``.

    An empty document (``None``) or a missing/empty ``steps`` or
    ``configurations`` section is treated as "nothing configured" instead of
    raising, so a partially filled file can still be loaded.

    Raises:
        AssertionError: if a configuration key contains a ``.``, since the
            dot is the separator between module name and key.
    """
    config = config or {}
    new_config = {}

    for step, values in (config.get('steps') or {}).items():
        new_config[f"--{step}"] = values

    # format configurations
    for name, config_values in (config.get('configurations') or {}).items():
        for key, value in (config_values or {}).items():
            # Raised explicitly (rather than via `assert`) so the check still
            # runs under `python -O`; the exception type is kept for callers.
            if "." in key:
                raise AssertionError("config key cannot contain '.'")
            new_config[f"--{name}.{key}"] = value

    return new_config
class LoadFromFile (argparse.Action):
def __call__ (self, parser, namespace, values, option_string = None):
@ -72,6 +54,14 @@ class LoadFromFile (argparse.Action):
# parse arguments in the file and store them in the target namespace
parser.parse_args(f.read().split(), namespace)
def read_yaml(yaml_filename: str) -> dict:
    """Load the YAML file at ``yaml_filename`` and return it passed through ``format_config``."""
    with open(yaml_filename, mode="r", encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
        return format_config(parsed)
def read_config(config_filename: str) -> ConfigParser:
    """Read an INI configuration file into a ``ConfigParser``.

    The file is decoded as UTF-8, matching what ``store_config`` writes.
    ``ConfigParser.read`` silently skips files it cannot open, so a missing
    file yields a parser that only contains the empty ``STEPS`` section.

    Returns:
        The populated parser, guaranteed to have a ``STEPS`` section.
    """
    config = ConfigParser()
    config.read(config_filename, encoding="utf-8")

    # setup basic format
    if not config.has_section("STEPS"):
        config.add_section("STEPS")
    return config
def store_config(config: ConfigParser, config_filename: str):
    """Serialise ``config`` to ``config_filename`` as UTF-8, overwriting any existing file."""
    with open(config_filename, mode="w", encoding="utf-8") as handle:
        config.write(handle)

Wyświetl plik

@ -1,11 +1,23 @@
import ast
from dataclasses import dataclass, field
import os
import copy
from os.path import join, dirname
from typing import List
# Names of the module categories known to the loader (singular form).
# NOTE(review): consumers appear to pluralise these with a trailing "s"
# (e.g. "feeders") when building option names — confirm against callers.
MODULE_TYPES = [
    'feeder',
    'enricher',
    'archiver',
    'database',
    'storage',
    'formatter'
]
MANIFEST_FILE = "__manifest__.py"
_DEFAULT_MANIFEST = {
'name': '',
'author': 'Bellingcat',
'requires_setup': True,
'depends': [],
@ -13,20 +25,54 @@ _DEFAULT_MANIFEST = {
'external_dependencies': {},
'entry_point': '',
'version': '1.0',
'config': {}
'configs': {}
}
def load_manifest(module):
@dataclass
class Module:
    """A discovered module directory together with the data from its manifest.

    ``name`` is the module's identifier as passed by the caller, ``path`` is
    where it was found and ``manifest`` is the raw manifest mapping. The
    remaining attributes are copied from the manifest. When no manifest is
    supplied (empty mapping) they fall back to neutral values so the instance
    is always fully initialised and safe to print, compare and inspect.
    """

    name: str
    display_name: str
    type: list
    entry_point: str
    depends: list
    external_dependencies: dict
    requires_setup: bool
    configs: dict
    description: str
    path: str
    manifest: dict

    def __init__(self, module_name, path, manifest):
        """Build a module record.

        Args:
            module_name: identifier of the module.
            path: location of the module.
            manifest: manifest mapping; may be empty when the manifest was not
                loaded, in which case manifest-derived attributes keep their
                neutral defaults.

        Raises:
            KeyError: if a non-empty manifest lacks one of the expected keys.
        """
        self.name = module_name
        self.path = path
        self.manifest = manifest

        # Neutral defaults: without these, a Module created with an empty
        # manifest would be missing attributes and __repr__/__eq__ would raise
        # AttributeError. Fresh containers are created per instance.
        self.display_name = ''
        self.type = []
        self.entry_point = ''
        self.depends = []
        self.external_dependencies = {}
        self.requires_setup = True
        self.configs = {}
        self.description = ''

        if manifest:
            self.display_name = manifest['name']
            self.type = manifest['type']
            self.entry_point = manifest['entry_point']
            self.depends = manifest['depends']
            self.external_dependencies = manifest['external_dependencies']
            self.requires_setup = manifest['requires_setup']
            self.configs = manifest['configs']
            self.description = manifest['description']

    def __repr__(self):
        return f"Module<'{self.display_name}' ({self.name})>"
def load_manifest(module_path):
print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
with open(join(module, MANIFEST_FILE)) as f:
with open(join(module_path, MANIFEST_FILE)) as f:
manifest.update(ast.literal_eval(f.read()))
return manifest
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]:
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(name):
if os.path.isfile(join(name, MANIFEST_FILE)):
@ -46,6 +92,6 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
manifest = load_manifest(possible_module_path)
else:
manifest = {}
all_modules.append((possible_module, possible_module_path, manifest))
all_modules.append(Module(possible_module, possible_module_path, manifest))
return all_modules

Wyświetl plik

@ -9,6 +9,11 @@ from typing import Generator, Union, List
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
import configparser
import os
from os.path import join, dirname
from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
@ -20,14 +25,15 @@ from ..enrichers import Enricher
from ..databases import Database
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml
from .loader import available_modules, load_manifest
from .config import read_config, store_config
from .loader import available_modules, Module, MODULE_TYPES
import tempfile, traceback
from loguru import logger
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class ArchivingOrchestrator:
# def __init__(self, config: Config) -> None:
@ -45,95 +51,145 @@ class ArchivingOrchestrator:
# logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
# self.cleanup()
def setup_parser(self):
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
# prog = "auto-archiver",
add_help=False,
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
description="""
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
""",
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
# override the default 'help' so we can inject all the configs and show those
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
self.parser = parser
def add_module_args(self, modules: list = None):
if not modules:
modules = available_modules(with_manifest=True)
parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file')
self.basic_parser = parser
for module_name, module_path, manifest in modules:
for name, kwargs in manifest['config'].items():
kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}"
self.parser.add_argument(f"--{module_name}.{name}", **kwargs)
def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None:
parser = argparse.ArgumentParser(
parents = [self.basic_parser],
add_help=False,
)
def show_help(self):
# for the help message, we want to load *all* possible modules and show the help
# add configs as arg parser arguments
self.add_module_args()
self.parser.print_help()
exit()
def setup_config(self, config: dict) -> None:
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that has requires_setup = False
# if full, we'll load all modules
if self.config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']]
self.add_module_args(simple_modules)
if ini_config:
# only load the modules enabled in config
enabled_modules = []
for module_type in MODULE_TYPES:
try:
enabled_modules.extend(ini_config.get("STEPS", module_type))
except configparser.NoOptionError:
pass
# now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required"
self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use')
self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use')
self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use')
self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use')
self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use')
self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use')
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
for module_type in MODULE_TYPES:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
# add them to the config
for module in simple_modules:
for module_type in module.type:
existing_modules = config['STEPS'] = module.name
ini_config.setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
parser.set_defaults(**ini_config)
config.update(self.config.__dict__)
# reload the parser with the new arguments, now that we have them
self.config, unknown = self.parser.parse_known_args(config)
logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
self.config, unknown = parser.parse_known_args(unused_args)
if unknown:
logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_config(ini_config, basic_config.config_file)
breakpoint()
logger.info(f"FEEDER: {self.config.feeders}")
logger.info(f"ENRICHERS: {self.config.enrichers}")
logger.info(f"ARCHIVERS: {self.config.archivers}")
logger.info(f"DATABASES: {self.config.databases}")
logger.info(f"STORAGES: {self.config.storages}")
logger.info(f"FORMATTER: {self.formatter.name}")
def add_steps_args(self, parser: argparse.ArgumentParser = None):
    """Register the mandatory step-selection options on ``parser``.

    Adds ``--feeders``, ``--enrichers``, ``--archivers``, ``--databases``,
    ``--storages`` and ``--formatter``; each takes one or more values and is
    required.

    Args:
        parser: parser to extend. When omitted, the orchestrator's own parser
            is used.
    """
    if parser is None:
        # The orchestrator's parser is stored as `basic_parser`; `parser` is
        # honoured first for code that still sets the older attribute.
        parser = getattr(self, 'parser', None) or self.basic_parser

    for step in ('feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'):
        parser.add_argument(f'--{step}', action='store', dest=step, nargs='+', required=True, help=f'the {step} to use')
def add_module_args(self, modules: list[Module] | None = None, parser: argparse.ArgumentParser | None = None):
    """Expose every module's manifest configs as ``--<module>.<config>`` options.

    Each module with configs gets its own argument group, titled with the
    module's display name (or its name) and the first 100 characters of its
    description.

    Args:
        modules: modules whose configs should be added. When empty or omitted,
            all available modules are loaded with their manifests.
        parser: parser to extend; defaults to the orchestrator's basic parser.
    """
    if not modules:
        modules = available_modules(with_manifest=True)
    if parser is None:
        parser = self.basic_parser

    # Manifests name types as strings; argparse needs the actual converter.
    type_names = {'str': str, 'int': int, 'float': float}
    # Only these argparse actions accept a `type` keyword.
    typed_actions = ('store', 'append', 'extend')

    for module in modules:
        if not module.configs:
            # this module has no configs, don't show anything in the help
            # (TODO: do we want to show something about this module though, like a description?)
            continue

        group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
        for name, config in module.configs.items():
            # Work on a copy: the manifest dict is shared, and mutating it
            # would corrupt `dest`/`type` on any later call.
            kwargs = dict(config)
            # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
            # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
            kwargs.pop('cli_set', None)
            kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"

            declared = kwargs.pop('type', 'str')
            action = kwargs.get('action', 'store')
            if not isinstance(action, str) or action in typed_actions:
                if isinstance(declared, str):
                    # unknown type names keep the previous behaviour (plain string)
                    kwargs['type'] = type_names.get(declared, str)
                else:
                    # already a callable converter
                    kwargs['type'] = declared
            group.add_argument(f"--{module.name}.{name}", **kwargs)
def show_help(self):
    """Print the complete help (step options plus every module's configs) and exit.

    Raises:
        SystemExit: always, with a success status, once the help is printed.
    """
    # for the help message, we want to load *all* possible modules and show the help
    # add configs as arg parser arguments
    self.add_steps_args(self.basic_parser)
    self.add_module_args(parser=self.basic_parser)

    self.basic_parser.print_help()
    # Use argparse's own exit rather than the `exit()` helper injected by the
    # `site` module, which is not available under `python -S` or frozen builds.
    self.basic_parser.exit()
def run(self) -> None:
self.setup_parser()
self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file)
# load the config file to get the list of enabled items
self.config, _ = self.parser.parse_known_args()
basic_config, unused_args = self.basic_parser.parse_known_args()
# if help flag was called, then show the help
if self.config.help:
if basic_config.help:
self.show_help()
# load the config file
config = {}
ini_config = {}
try:
config = read_yaml(self.config.config_file)
ini_config = read_config(basic_config.config_file)
except FileNotFoundError:
if self.config.config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
if basic_config.config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
self.setup_config(config)
self.setup_complete_parser(basic_config, ini_config, unused_args)
breakpoint()
config.parse()

Wyświetl plik

@ -30,7 +30,7 @@ the broader archiving framework.
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
""",
'config': {
'configs': {
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
"subtitles": {"default": True, "help": "download subtitles if available"},
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
@ -40,7 +40,7 @@ via the command line using the `--dropins` option (TODO!).
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
}

Wyświetl plik

@ -13,7 +13,7 @@
},
"configs": {
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"},
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
"access_token": {"default": None, "help": "twitter API access_token"},