kopia lustrzana https://github.com/bellingcat/auto-archiver
Add a framework for config settings that should not be stored in the config file (e.g. cli_feeder.urls)
Use 'do_not_store': True in a module's config settings to apply this. Also: fix up generic archiver dropin loading and the local_storage defaults (now the same as in the example orchestration). pull/224/head
rodzic
57b3bec935
commit
1d2a1d4db7
|
@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
|
|||
from copy import deepcopy
|
||||
from .module import MODULE_TYPES
|
||||
|
||||
from typing import Any, List, Type
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
yaml = YAML()
|
||||
|
||||
|
@ -101,6 +101,15 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
|||
|
||||
return config
|
||||
|
||||
def store_yaml(config: CommentedMap, yaml_filename: str):
|
||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
|
||||
|
||||
def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = None) -> None:
    """Write ``config`` to ``yaml_filename``, leaving out keys that must not be persisted.

    The caller's ``config`` is not modified: a deep copy is pruned and dumped.

    Args:
        config: the configuration mapping to save.
        yaml_filename: path of the YAML file to (over)write, encoded as UTF-8.
        do_not_store_keys: ``(section, key)`` pairs; for each pair,
            ``config[section][key]`` is omitted from the file when present.
            ``None`` (the default) or an empty list stores everything.
    """
    config_to_save = deepcopy(config)

    # The default is `None` rather than `[]`: a mutable default is created once
    # and shared by every call, so it must never be used as a parameter default.
    for section, key in do_not_store_keys or ():
        if section in config_to_save and key in config_to_save[section]:
            del config_to_save[section][key]

    with open(yaml_filename, "w", encoding="utf-8") as outf:
        # Dump the pruned copy, not `config`, so excluded keys never reach disk.
        yaml.dump(config_to_save, outf)
|
|
@ -39,20 +39,7 @@ class UniqueAppendAction(argparse.Action):
|
|||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# def __init__(self, config: Config) -> None:
|
||||
# self.feeder: Feeder = config.feeder
|
||||
# self.formatter: Formatter = config.formatter
|
||||
# self.enrichers: List[Enricher] = config.enrichers
|
||||
# self.archivers: List[Archiver] = config.archivers
|
||||
# self.databases: List[Database] = config.databases
|
||||
# self.storages: List[Storage] = config.storages
|
||||
# ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
# try:
|
||||
# for a in self.all_archivers_for_setup(): a.setup()
|
||||
# except (KeyboardInterrupt, Exception) as e:
|
||||
# logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
# self.cleanup()
|
||||
_do_not_store_keys = []
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
|
@ -125,10 +112,10 @@ class ArchivingOrchestrator:
|
|||
|
||||
if unknown:
|
||||
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
|
||||
|
||||
|
||||
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
||||
logger.info(f"Storing configuration file to {basic_config.config_file}")
|
||||
store_yaml(self.config, basic_config.config_file)
|
||||
store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
|
||||
|
||||
return self.config
|
||||
|
||||
|
@ -167,6 +154,10 @@ class ArchivingOrchestrator:
|
|||
for name, kwargs in module.configs.items():
|
||||
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
|
||||
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
|
||||
do_not_store = kwargs.pop('do_not_store', False)
|
||||
if do_not_store:
|
||||
self._do_not_store_keys.append((module.name, name))
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
|
@ -193,7 +184,7 @@ class ArchivingOrchestrator:
|
|||
logging_config = self.config['logging']
|
||||
logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file, rotation=logging_config['logging.rotation'])
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
|
||||
|
||||
def install_modules(self):
|
||||
|
@ -221,7 +212,14 @@ class ArchivingOrchestrator:
|
|||
if module in invalid_modules:
|
||||
continue
|
||||
loaded_module: BaseModule = get_module(module).load()
|
||||
loaded_module.setup(self.config)
|
||||
try:
|
||||
loaded_module.setup(self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor':
|
||||
loaded_module.cleanup()
|
||||
exit()
|
||||
|
||||
if not loaded_module:
|
||||
invalid_modules.append(module)
|
||||
continue
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"nargs": "+",
|
||||
"required": True,
|
||||
"do_not_store": True,
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
|
|
|
@ -121,6 +121,7 @@ class GenericExtractor(Extractor):
|
|||
|
||||
ie_instance = info_extractor(downloader=ydl)
|
||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||
|
||||
if not dropin:
|
||||
# TODO: add a proper link to 'how to create your own dropin'
|
||||
logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
|
||||
|
@ -172,6 +173,8 @@ class GenericExtractor(Extractor):
|
|||
|
||||
def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
|
||||
|
||||
dropin_name = dropin_name.lower()
|
||||
|
||||
if dropin_name == "generic":
|
||||
# no need for a dropin for the generic extractor (?)
|
||||
return None
|
||||
|
|
|
@ -7,16 +7,16 @@
|
|||
},
|
||||
"configs": {
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"default": "flat",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
"choices": ["flat", "url", "random"],
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
|
|
Ładowanie…
Reference in New Issue