kopia lustrzana https://github.com/bellingcat/auto-archiver
				
				
				
			Further tweaks based on __manifest__.py files
Loading configs now works
pull/183/head
							rodzic
							
								
									7b3a1468cd
								
							
						
					
					
						commit
						54995ad6ab
					
				| 
						 | 
				
			
			@ -1043,7 +1043,7 @@ version = "3.0.0"
 | 
			
		|||
description = "Python port of markdown-it. Markdown parsing, done right!"
 | 
			
		||||
optional = false
 | 
			
		||||
python-versions = ">=3.8"
 | 
			
		||||
groups = ["docs"]
 | 
			
		||||
groups = ["main", "docs"]
 | 
			
		||||
files = [
 | 
			
		||||
    {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
 | 
			
		||||
    {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
 | 
			
		||||
| 
						 | 
				
			
			@ -1179,7 +1179,7 @@ version = "0.1.2"
 | 
			
		|||
description = "Markdown URL utilities"
 | 
			
		||||
optional = false
 | 
			
		||||
python-versions = ">=3.7"
 | 
			
		||||
groups = ["docs"]
 | 
			
		||||
groups = ["main", "docs"]
 | 
			
		||||
files = [
 | 
			
		||||
    {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
 | 
			
		||||
    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
 | 
			
		||||
| 
						 | 
				
			
			@ -1654,7 +1654,7 @@ version = "2.19.1"
 | 
			
		|||
description = "Pygments is a syntax highlighting package written in Python."
 | 
			
		||||
optional = false
 | 
			
		||||
python-versions = ">=3.8"
 | 
			
		||||
groups = ["docs"]
 | 
			
		||||
groups = ["main", "docs"]
 | 
			
		||||
files = [
 | 
			
		||||
    {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
 | 
			
		||||
    {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
 | 
			
		||||
| 
						 | 
				
			
			@ -2031,6 +2031,41 @@ files = [
 | 
			
		|||
[package.dependencies]
 | 
			
		||||
six = ">=1.7.0"
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "rich"
 | 
			
		||||
version = "13.9.4"
 | 
			
		||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
 | 
			
		||||
optional = false
 | 
			
		||||
python-versions = ">=3.8.0"
 | 
			
		||||
groups = ["main"]
 | 
			
		||||
files = [
 | 
			
		||||
    {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"},
 | 
			
		||||
    {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"},
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[package.dependencies]
 | 
			
		||||
markdown-it-py = ">=2.2.0"
 | 
			
		||||
pygments = ">=2.13.0,<3.0.0"
 | 
			
		||||
typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""}
 | 
			
		||||
 | 
			
		||||
[package.extras]
 | 
			
		||||
jupyter = ["ipywidgets (>=7.5.1,<9)"]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "rich-argparse"
 | 
			
		||||
version = "1.6.0"
 | 
			
		||||
description = "Rich help formatters for argparse and optparse"
 | 
			
		||||
optional = false
 | 
			
		||||
python-versions = ">=3.8"
 | 
			
		||||
groups = ["main"]
 | 
			
		||||
files = [
 | 
			
		||||
    {file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"},
 | 
			
		||||
    {file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"},
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[package.dependencies]
 | 
			
		||||
rich = ">=11.0.0"
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "rsa"
 | 
			
		||||
version = "4.9"
 | 
			
		||||
| 
						 | 
				
			
			@ -2966,4 +3001,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 | 
			
		|||
[metadata]
 | 
			
		||||
lock-version = "2.1"
 | 
			
		||||
python-versions = ">=3.10,<3.13"
 | 
			
		||||
content-hash = "462c7c5f9d1fbae895d6299ba0b690b6e24d0655a4c9fc79f75ddef4eec222f8"
 | 
			
		||||
content-hash = "911543169cbd6c68ab3392a052ea58917539acdfbc6511e591f8a2b497443cdc"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -56,6 +56,7 @@ dependencies = [
 | 
			
		|||
    "retrying (>=0.0.0)",
 | 
			
		||||
    "tsp-client (>=0.0.0)",
 | 
			
		||||
    "certvalidator (>=0.0.0)",
 | 
			
		||||
    "rich-argparse (>=1.6.0,<2.0.0)",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.dev.dependencies]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -4,14 +4,11 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow
 | 
			
		|||
flexible setup in various environments.
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import argparse
 | 
			
		||||
import yaml
 | 
			
		||||
from configparser import ConfigParser
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# @dataclass
 | 
			
		||||
# class Config:
 | 
			
		||||
#     configurable_parents = [
 | 
			
		||||
#         Feeder,
 | 
			
		||||
#         Enricher,
 | 
			
		||||
| 
						 | 
				
			
			@ -50,21 +47,6 @@ from dataclasses import dataclass, field
 | 
			
		|||
        #     parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
 | 
			
		||||
        #     parser.add_argument('--version', action='version', version=__version__)
 | 
			
		||||
 | 
			
		||||
def format_config(config: dict) -> dict:
 | 
			
		||||
    # Iterate over all step subclasses to gather default configs and CLI arguments
 | 
			
		||||
    new_config = {}
 | 
			
		||||
    for step, values in config['steps'].items():
 | 
			
		||||
        new_config[f"--{step}"] = values
 | 
			
		||||
    
 | 
			
		||||
    # format configurations
 | 
			
		||||
    for name, confg_vals in config['configurations'].items():
 | 
			
		||||
        for key, value in confg_vals.items():
 | 
			
		||||
            assert "." not in key, "config key cannot contain '.'"
 | 
			
		||||
            config_path = f"--{name}.{key}"
 | 
			
		||||
            new_config[config_path] = value
 | 
			
		||||
 | 
			
		||||
    return new_config
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class LoadFromFile (argparse.Action):
 | 
			
		||||
    def __call__ (self, parser, namespace, values, option_string = None):
 | 
			
		||||
| 
						 | 
				
			
			@ -72,6 +54,14 @@ class LoadFromFile (argparse.Action):
 | 
			
		|||
            # parse arguments in the file and store them in the target namespace
 | 
			
		||||
            parser.parse_args(f.read().split(), namespace)
 | 
			
		||||
 | 
			
		||||
def read_yaml(yaml_filename: str) -> dict:
 | 
			
		||||
    with open(yaml_filename, "r", encoding="utf-8") as inf:
 | 
			
		||||
        return format_config(yaml.safe_load(inf))
 | 
			
		||||
def read_config(config_filename: str) -> dict:
 | 
			
		||||
    config = ConfigParser()
 | 
			
		||||
    config.read(config_filename)
 | 
			
		||||
    # setup basic format
 | 
			
		||||
    if 'STEPS' not in config.sections():
 | 
			
		||||
        config.add_section("STEPS")
 | 
			
		||||
    return config
 | 
			
		||||
 | 
			
		||||
def store_config(config: ConfigParser, config_filename: str):
 | 
			
		||||
    with open(config_filename, "w", encoding="utf-8") as outf:
 | 
			
		||||
        config.write(outf)
 | 
			
		||||
| 
						 | 
				
			
			@ -1,11 +1,23 @@
 | 
			
		|||
import ast
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
import os
 | 
			
		||||
import copy
 | 
			
		||||
from os.path import join, dirname
 | 
			
		||||
from typing import List
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MODULE_TYPES = [
 | 
			
		||||
    'feeder',
 | 
			
		||||
    'enricher',
 | 
			
		||||
    'archiver',
 | 
			
		||||
    'database',
 | 
			
		||||
    'storage',
 | 
			
		||||
    'formatter'
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
MANIFEST_FILE = "__manifest__.py"
 | 
			
		||||
_DEFAULT_MANIFEST = {
 | 
			
		||||
    'name': '',
 | 
			
		||||
    'author': 'Bellingcat',
 | 
			
		||||
    'requires_setup': True,
 | 
			
		||||
    'depends': [],
 | 
			
		||||
| 
						 | 
				
			
			@ -13,20 +25,54 @@ _DEFAULT_MANIFEST = {
 | 
			
		|||
    'external_dependencies': {},
 | 
			
		||||
    'entry_point': '',
 | 
			
		||||
    'version': '1.0',
 | 
			
		||||
    'config': {}
 | 
			
		||||
    'configs': {}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
def load_manifest(module):
 | 
			
		||||
@dataclass
 | 
			
		||||
class Module:
 | 
			
		||||
    name: str
 | 
			
		||||
    display_name: str
 | 
			
		||||
    type: list
 | 
			
		||||
    entry_point: str
 | 
			
		||||
    depends: list
 | 
			
		||||
    external_dependencies: dict
 | 
			
		||||
    requires_setup: bool
 | 
			
		||||
    configs: dict
 | 
			
		||||
    description: str
 | 
			
		||||
    path: str
 | 
			
		||||
    manifest: dict
 | 
			
		||||
 | 
			
		||||
    def __init__(self, module_name, path, manifest):
 | 
			
		||||
        self.name = module_name
 | 
			
		||||
        self.path = path
 | 
			
		||||
        self.manifest = manifest
 | 
			
		||||
        if manifest:
 | 
			
		||||
            self.display_name = manifest['name']
 | 
			
		||||
            self.type = manifest['type']
 | 
			
		||||
            self.entry_point = manifest['entry_point']
 | 
			
		||||
            self.depends = manifest['depends']
 | 
			
		||||
            self.external_dependencies = manifest['external_dependencies']
 | 
			
		||||
            self.requires_setup = manifest['requires_setup']
 | 
			
		||||
            self.configs = manifest['configs']
 | 
			
		||||
            self.description = manifest['description']
 | 
			
		||||
 | 
			
		||||
    def __repr__(self):
 | 
			
		||||
        return f"Module<'{self.display_name}' ({self.name})>"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def load_manifest(module_path):
 | 
			
		||||
    print(f"Loading manifest for module {module_path}")
 | 
			
		||||
    # load the manifest file
 | 
			
		||||
    manifest = copy.deepcopy(_DEFAULT_MANIFEST)
 | 
			
		||||
 | 
			
		||||
    with open(join(module, MANIFEST_FILE)) as f:
 | 
			
		||||
    with open(join(module_path, MANIFEST_FILE)) as f:
 | 
			
		||||
        manifest.update(ast.literal_eval(f.read()))
 | 
			
		||||
    return manifest
 | 
			
		||||
 | 
			
		||||
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]:
 | 
			
		||||
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]:
 | 
			
		||||
    # search through all valid 'modules' paths. Default is 'modules' in the current directory
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    # see odoo/modules/module.py -> get_modules
 | 
			
		||||
    def is_really_module(name):
 | 
			
		||||
        if os.path.isfile(join(name, MANIFEST_FILE)):
 | 
			
		||||
| 
						 | 
				
			
			@ -46,6 +92,6 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
 | 
			
		|||
                manifest = load_manifest(possible_module_path)
 | 
			
		||||
            else:
 | 
			
		||||
                manifest = {}
 | 
			
		||||
            all_modules.append((possible_module, possible_module_path, manifest))
 | 
			
		||||
            all_modules.append(Module(possible_module, possible_module_path, manifest))
 | 
			
		||||
 | 
			
		||||
    return all_modules
 | 
			
		||||
| 
						 | 
				
			
			@ -9,6 +9,11 @@ from typing import Generator, Union, List
 | 
			
		|||
from urllib.parse import urlparse
 | 
			
		||||
from ipaddress import ip_address
 | 
			
		||||
import argparse
 | 
			
		||||
import configparser
 | 
			
		||||
import os
 | 
			
		||||
from os.path import join, dirname
 | 
			
		||||
 | 
			
		||||
from rich_argparse import RichHelpFormatter
 | 
			
		||||
 | 
			
		||||
from .context import ArchivingContext
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -20,14 +25,15 @@ from ..enrichers import Enricher
 | 
			
		|||
from ..databases import Database
 | 
			
		||||
from .metadata import Metadata
 | 
			
		||||
from ..version import __version__
 | 
			
		||||
from .config import read_yaml
 | 
			
		||||
from .loader import available_modules, load_manifest
 | 
			
		||||
from .config import read_config, store_config
 | 
			
		||||
from .loader import available_modules, Module, MODULE_TYPES
 | 
			
		||||
 | 
			
		||||
import tempfile, traceback
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
 | 
			
		||||
 | 
			
		||||
class ArchivingOrchestrator:
 | 
			
		||||
 | 
			
		||||
    # def __init__(self, config: Config) -> None:
 | 
			
		||||
| 
						 | 
				
			
			@ -45,95 +51,145 @@ class ArchivingOrchestrator:
 | 
			
		|||
    #         logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
 | 
			
		||||
    #         self.cleanup()
 | 
			
		||||
 | 
			
		||||
    def setup_parser(self):
 | 
			
		||||
    def setup_basic_parser(self):
 | 
			
		||||
        parser = argparse.ArgumentParser(
 | 
			
		||||
                # prog = "auto-archiver",
 | 
			
		||||
                add_help=False,
 | 
			
		||||
                description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
 | 
			
		||||
                epilog="Check the code at https://github.com/bellingcat/auto-archiver"
 | 
			
		||||
                description="""
 | 
			
		||||
                Auto Archiver is a CLI tool to archive media/metadata from online URLs;
 | 
			
		||||
                it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
 | 
			
		||||
                """,
 | 
			
		||||
                epilog="Check the code at https://github.com/bellingcat/auto-archiver",
 | 
			
		||||
                formatter_class=RichHelpFormatter,
 | 
			
		||||
        )
 | 
			
		||||
        parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
 | 
			
		||||
        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
 | 
			
		||||
        parser.add_argument('--version', action='version', version=__version__)
 | 
			
		||||
        parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
 | 
			
		||||
        # override the default 'help' so we can inject all the configs and show those
 | 
			
		||||
        parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
 | 
			
		||||
        self.parser = parser
 | 
			
		||||
    
 | 
			
		||||
    def add_module_args(self, modules: list = None):
 | 
			
		||||
        if not modules:
 | 
			
		||||
            modules = available_modules(with_manifest=True)
 | 
			
		||||
        parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file')
 | 
			
		||||
        self.basic_parser = parser
 | 
			
		||||
 | 
			
		||||
        for module_name, module_path, manifest in modules:
 | 
			
		||||
            for name, kwargs in manifest['config'].items():
 | 
			
		||||
                kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}"
 | 
			
		||||
                self.parser.add_argument(f"--{module_name}.{name}", **kwargs)
 | 
			
		||||
    def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None:
 | 
			
		||||
        parser = argparse.ArgumentParser(
 | 
			
		||||
            parents = [self.basic_parser],
 | 
			
		||||
            add_help=False,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def show_help(self):
 | 
			
		||||
        # for the help message, we want to load *all* possible modules and show the help
 | 
			
		||||
            # add configs as arg parser arguments
 | 
			
		||||
        self.add_module_args()
 | 
			
		||||
 | 
			
		||||
        self.parser.print_help()
 | 
			
		||||
        exit()
 | 
			
		||||
 | 
			
		||||
    def setup_config(self, config: dict) -> None:
 | 
			
		||||
        # check what mode we're in
 | 
			
		||||
        # if we have a config file, use that to decide which modules to load
 | 
			
		||||
        # if simple, we'll load just the modules that has requires_setup = False
 | 
			
		||||
        # if full, we'll load all modules
 | 
			
		||||
        if self.config.mode == 'simple':
 | 
			
		||||
            simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']]
 | 
			
		||||
            self.add_module_args(simple_modules)
 | 
			
		||||
        if ini_config:
 | 
			
		||||
            # only load the modules enabled in config
 | 
			
		||||
            enabled_modules = []
 | 
			
		||||
            for module_type in MODULE_TYPES:
 | 
			
		||||
                try:
 | 
			
		||||
                    enabled_modules.extend(ini_config.get("STEPS", module_type))
 | 
			
		||||
                except configparser.NoOptionError:
 | 
			
		||||
                    pass
 | 
			
		||||
 | 
			
		||||
        # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required"
 | 
			
		||||
        self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use')
 | 
			
		||||
        self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use')
 | 
			
		||||
        self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use')
 | 
			
		||||
        self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use')
 | 
			
		||||
        self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use')
 | 
			
		||||
        self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use')
 | 
			
		||||
            # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
 | 
			
		||||
            for module_type in MODULE_TYPES:
 | 
			
		||||
                if modules := getattr(basic_config, f"{module_type}s", []):
 | 
			
		||||
                    enabled_modules.extend(modules)
 | 
			
		||||
 | 
			
		||||
            self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser)
 | 
			
		||||
        elif basic_config.mode == 'simple':
 | 
			
		||||
            simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
 | 
			
		||||
            self.add_module_args(simple_modules, parser)
 | 
			
		||||
            # add them to the config
 | 
			
		||||
            for module in simple_modules:
 | 
			
		||||
                for module_type in module.type:
 | 
			
		||||
                    existing_modules = config['STEPS'] = module.name
 | 
			
		||||
                    ini_config.setdefault(f"{module_type}s", []).append(module.name)
 | 
			
		||||
 | 
			
		||||
        else:
 | 
			
		||||
            # load all modules, they're not using the 'simple' mode
 | 
			
		||||
            self.add_module_args(available_modules(with_manifest=True), parser)
 | 
			
		||||
 | 
			
		||||
        parser.set_defaults(**ini_config)
 | 
			
		||||
 | 
			
		||||
        
 | 
			
		||||
        config.update(self.config.__dict__)
 | 
			
		||||
        # reload the parser with the new arguments, now that we have them
 | 
			
		||||
        self.config, unknown = self.parser.parse_known_args(config)
 | 
			
		||||
        logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
 | 
			
		||||
        self.config, unknown = parser.parse_known_args(unused_args)
 | 
			
		||||
        if unknown:
 | 
			
		||||
            logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
 | 
			
		||||
 | 
			
		||||
        if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
 | 
			
		||||
            logger.info(f"Storing configuration file to {basic_config.config_file}")
 | 
			
		||||
            store_config(ini_config, basic_config.config_file)
 | 
			
		||||
        breakpoint()
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        logger.info(f"FEEDER: {self.config.feeders}")
 | 
			
		||||
        logger.info(f"ENRICHERS: {self.config.enrichers}")
 | 
			
		||||
        logger.info(f"ARCHIVERS: {self.config.archivers}")
 | 
			
		||||
        logger.info(f"DATABASES: {self.config.databases}")
 | 
			
		||||
        logger.info(f"STORAGES: {self.config.storages}")
 | 
			
		||||
        logger.info(f"FORMATTER: {self.formatter.name}")
 | 
			
		||||
        
 | 
			
		||||
    
 | 
			
		||||
    def add_steps_args(self, parser: argparse.ArgumentParser = None):
 | 
			
		||||
        if not parser:
 | 
			
		||||
            parser = self.parser
 | 
			
		||||
 | 
			
		||||
        parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use')
 | 
			
		||||
        parser.add_argument('--enrichers', action='store', dest='enrichers',  nargs='+', required=True, help='the enrichers to use')
 | 
			
		||||
        parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use')
 | 
			
		||||
        parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use')
 | 
			
		||||
        parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use')
 | 
			
		||||
        parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use')
 | 
			
		||||
 | 
			
		||||
    def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None):
 | 
			
		||||
 | 
			
		||||
        if not modules:
 | 
			
		||||
            modules = available_modules(with_manifest=True)
 | 
			
		||||
 | 
			
		||||
        for module in modules:
 | 
			
		||||
            if not module.configs:
 | 
			
		||||
                # this module has no configs, don't show anything in the help
 | 
			
		||||
                # (TODO: do we want to show something about this module though, like a description?)
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
 | 
			
		||||
            for name, kwargs in module.configs.items():
 | 
			
		||||
                # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
 | 
			
		||||
                # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
 | 
			
		||||
                kwargs.pop('cli_set', None)
 | 
			
		||||
                kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
 | 
			
		||||
                kwargs['type'] = type(kwargs.get('type', 'str'))
 | 
			
		||||
                group.add_argument(f"--{module.name}.{name}", **kwargs)
 | 
			
		||||
 | 
			
		||||
    def show_help(self):
 | 
			
		||||
        # for the help message, we want to load *all* possible modules and show the help
 | 
			
		||||
            # add configs as arg parser arguments
 | 
			
		||||
        
 | 
			
		||||
        self.add_steps_args(self.basic_parser)
 | 
			
		||||
        self.add_module_args(parser=self.basic_parser)
 | 
			
		||||
 | 
			
		||||
        self.basic_parser.print_help()
 | 
			
		||||
        exit()
 | 
			
		||||
 | 
			
		||||
    def run(self) -> None:
 | 
			
		||||
        self.setup_parser()
 | 
			
		||||
        self.setup_basic_parser()
 | 
			
		||||
 | 
			
		||||
        # parse the known arguments for now (basically, we want the config file)
 | 
			
		||||
 | 
			
		||||
        # load the config file to get the list of enabled items
 | 
			
		||||
        self.config, _ = self.parser.parse_known_args()
 | 
			
		||||
        basic_config, unused_args = self.basic_parser.parse_known_args()
 | 
			
		||||
 | 
			
		||||
        # if help flag was called, then show the help
 | 
			
		||||
        if self.config.help:
 | 
			
		||||
        if basic_config.help:
 | 
			
		||||
            self.show_help()
 | 
			
		||||
 | 
			
		||||
        # load the config file
 | 
			
		||||
        config = {}
 | 
			
		||||
        ini_config = {}
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            config = read_yaml(self.config.config_file)
 | 
			
		||||
            ini_config = read_config(basic_config.config_file)
 | 
			
		||||
        except FileNotFoundError:
 | 
			
		||||
            if self.config.config_file != DEFAULT_CONFIG_FILE:
 | 
			
		||||
                logger.error(f"The configuration file {self.config.config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
 | 
			
		||||
            if basic_config.config_file != DEFAULT_CONFIG_FILE:
 | 
			
		||||
                logger.error(f"The configuration file {basic_config.config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
 | 
			
		||||
                exit()
 | 
			
		||||
 | 
			
		||||
        self.setup_config(config)
 | 
			
		||||
        self.setup_complete_parser(basic_config, ini_config, unused_args)
 | 
			
		||||
 | 
			
		||||
        breakpoint()
 | 
			
		||||
        config.parse()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -30,7 +30,7 @@ the broader archiving framework.
 | 
			
		|||
custom dropins can be created to handle additional websites and passed to the archiver
 | 
			
		||||
via the command line using the `--dropins` option (TODO!).
 | 
			
		||||
""",
 | 
			
		||||
    'config': {
 | 
			
		||||
    'configs': {
 | 
			
		||||
            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
 | 
			
		||||
            "subtitles": {"default": True, "help": "download subtitles if available"},
 | 
			
		||||
            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
 | 
			
		||||
| 
						 | 
				
			
			@ -40,7 +40,7 @@ via the command line using the `--dropins` option (TODO!).
 | 
			
		|||
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
 | 
			
		||||
            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
 | 
			
		||||
            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
 | 
			
		||||
            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
 | 
			
		||||
            "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
 | 
			
		||||
            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
 | 
			
		||||
        }
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -13,7 +13,7 @@
 | 
			
		|||
    },
 | 
			
		||||
    "configs": {
 | 
			
		||||
            "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
 | 
			
		||||
            "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
 | 
			
		||||
            "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"},
 | 
			
		||||
            "consumer_key": {"default": None, "help": "twitter API consumer_key"},
 | 
			
		||||
            "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
 | 
			
		||||
            "access_token": {"default": None, "help": "twitter API access_token"},
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Ładowanie…
	
		Reference in New Issue