kopia lustrzana https://github.com/bellingcat/auto-archiver
Initial changes to move to '__manifest__' format
rodzic
03f3770223
commit
241b35002c
|
@ -2,6 +2,5 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
|
|||
|
||||
# need to manually specify due to cyclical deps
|
||||
from .core.orchestrator import ArchivingOrchestrator
|
||||
from .core.config import Config
|
||||
# making accessible directly
|
||||
from .core.metadata import Metadata
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
""" Entry point for the auto_archiver package. """
|
||||
from . import Config
|
||||
from . import ArchivingOrchestrator
|
||||
|
||||
def main():
|
||||
config = Config()
|
||||
config.parse()
|
||||
orchestrator = ArchivingOrchestrator(config)
|
||||
for r in orchestrator.feed(): pass
|
||||
|
||||
def main():
|
||||
ArchivingOrchestrator().run()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -12,5 +12,4 @@ from .instagram_archiver import InstagramArchiver
|
|||
from .instagram_tbot_archiver import InstagramTbotArchiver
|
||||
from .telegram_archiver import TelegramArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver
|
||||
from .instagram_api_archiver import InstagramAPIArchiver
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
from .generic_archiver import GenericArchiver
|
|
@ -21,110 +21,109 @@ from ..storages import Storage
|
|||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
from ..utils import update_nested_dict
|
||||
from ..version import __version__
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
configurable_parents = [
|
||||
Feeder,
|
||||
Enricher,
|
||||
Archiver,
|
||||
Database,
|
||||
Storage,
|
||||
Formatter
|
||||
# Util
|
||||
]
|
||||
feeder: Feeder
|
||||
formatter: Formatter
|
||||
archivers: List[Archiver] = field(default_factory=[])
|
||||
enrichers: List[Enricher] = field(default_factory=[])
|
||||
storages: List[Storage] = field(default_factory=[])
|
||||
databases: List[Database] = field(default_factory=[])
|
||||
# @dataclass
|
||||
# class Config:
|
||||
# configurable_parents = [
|
||||
# Feeder,
|
||||
# Enricher,
|
||||
# Archiver,
|
||||
# Database,
|
||||
# Storage,
|
||||
# Formatter
|
||||
# # Util
|
||||
# ]
|
||||
# feeder: Feeder
|
||||
# formatter: Formatter
|
||||
# archivers: List[Archiver] = field(default_factory=[])
|
||||
# enrichers: List[Enricher] = field(default_factory=[])
|
||||
# storages: List[Storage] = field(default_factory=[])
|
||||
# databases: List[Database] = field(default_factory=[])
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.defaults = {}
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
# def __init__(self) -> None:
|
||||
# self.defaults = {}
|
||||
# self.cli_ops = {}
|
||||
# self.config = {}
|
||||
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
"""
|
||||
if yaml_config_filename is provided, the --config argument is ignored,
|
||||
useful for library usage when the config values are preloaded
|
||||
overwrite_configs is a dict that overwrites the yaml file contents
|
||||
"""
|
||||
# 1. parse CLI values
|
||||
if use_cli:
|
||||
parser = argparse.ArgumentParser(
|
||||
# prog = "auto-archiver",
|
||||
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
# """
|
||||
# if yaml_config_filename is provided, the --config argument is ignored,
|
||||
# useful for library usage when the config values are preloaded
|
||||
# overwrite_configs is a dict that overwrites the yaml file contents
|
||||
# """
|
||||
# # 1. parse CLI values
|
||||
# if use_cli:
|
||||
# parser = argparse.ArgumentParser(
|
||||
# # prog = "auto-archiver",
|
||||
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
# )
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
# parser.add_argument('--version', action='version', version=__version__)
|
||||
|
||||
# Iterate over all step subclasses to gather default configs and CLI arguments
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
for child in configurable.__subclasses__():
|
||||
assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
|
||||
for config, details in child.configs().items():
|
||||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
config_path = f"{child.name}.{config}"
|
||||
# # Iterate over all step subclasses to gather default configs and CLI arguments
|
||||
# for configurable in self.configurable_parents:
|
||||
# child: Step
|
||||
# for child in configurable.__subclasses__():
|
||||
# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
|
||||
# for config, details in child.configs().items():
|
||||
# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
# assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
# config_path = f"{child.name}.{config}"
|
||||
|
||||
if use_cli:
|
||||
try:
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
except argparse.ArgumentError:
|
||||
# captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
|
||||
pass
|
||||
# if use_cli:
|
||||
# try:
|
||||
# parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
# except argparse.ArgumentError:
|
||||
# # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver
|
||||
# pass
|
||||
|
||||
self.defaults[config_path] = details["default"]
|
||||
if "cli_set" in details:
|
||||
self.cli_ops[config_path] = details["cli_set"]
|
||||
# self.defaults[config_path] = details["default"]
|
||||
# if "cli_set" in details:
|
||||
# self.cli_ops[config_path] = details["cli_set"]
|
||||
|
||||
if use_cli:
|
||||
args = parser.parse_args()
|
||||
yaml_config_filename = yaml_config_filename or getattr(args, "config")
|
||||
else: args = {}
|
||||
# if use_cli:
|
||||
# args = parser.parse_args()
|
||||
# yaml_config_filename = yaml_config_filename or getattr(args, "config")
|
||||
# else: args = {}
|
||||
|
||||
# 2. read YAML config file (or use provided value)
|
||||
self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
update_nested_dict(self.yaml_config, overwrite_configs)
|
||||
# # 2. read YAML config file (or use provided value)
|
||||
# self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
# update_nested_dict(self.yaml_config, overwrite_configs)
|
||||
|
||||
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
self.config = defaultdict(dict)
|
||||
for config_path, default in self.defaults.items():
|
||||
child, config = tuple(config_path.split("."))
|
||||
val = getattr(args, config_path, None)
|
||||
if val is not None and config_path in self.cli_ops:
|
||||
val = self.cli_ops[config_path](val, default)
|
||||
if val is None:
|
||||
val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
|
||||
self.config[child][config] = val
|
||||
self.config = dict(self.config)
|
||||
# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
# self.config = defaultdict(dict)
|
||||
# for config_path, default in self.defaults.items():
|
||||
# child, config = tuple(config_path.split("."))
|
||||
# val = getattr(args, config_path, None)
|
||||
# if val is not None and config_path in self.cli_ops:
|
||||
# val = self.cli_ops[config_path](val, default)
|
||||
# if val is None:
|
||||
# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
|
||||
# self.config[child][config] = val
|
||||
# self.config = dict(self.config)
|
||||
|
||||
# 4. STEPS: read steps and validate they exist
|
||||
steps = self.yaml_config.get("steps", {})
|
||||
assert "archivers" in steps, "your configuration steps are missing the archivers property"
|
||||
assert "storages" in steps, "your configuration steps are missing the storages property"
|
||||
# # 4. STEPS: read steps and validate they exist
|
||||
# steps = self.yaml_config.get("steps", {})
|
||||
# assert "archivers" in steps, "your configuration steps are missing the archivers property"
|
||||
# assert "storages" in steps, "your configuration steps are missing the storages property"
|
||||
|
||||
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
|
||||
self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
|
||||
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
|
||||
self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
|
||||
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
|
||||
self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
|
||||
# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
|
||||
# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
|
||||
# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
|
||||
# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
|
||||
# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
|
||||
# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
|
||||
|
||||
logger.info(f"FEEDER: {self.feeder.name}")
|
||||
logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
|
||||
logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
|
||||
logger.info(f"DATABASES: {[x.name for x in self.databases]}")
|
||||
logger.info(f"STORAGES: {[x.name for x in self.storages]}")
|
||||
logger.info(f"FORMATTER: {self.formatter.name}")
|
||||
# logger.info(f"FEEDER: {self.feeder.name}")
|
||||
# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
|
||||
# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
|
||||
# logger.info(f"DATABASES: {[x.name for x in self.databases]}")
|
||||
# logger.info(f"STORAGES: {[x.name for x in self.storages]}")
|
||||
# logger.info(f"FORMATTER: {self.formatter.name}")
|
||||
|
||||
def read_yaml(self, yaml_filename: str) -> dict:
|
||||
with open(yaml_filename, "r", encoding="utf-8") as inf:
|
||||
return yaml.safe_load(inf)
|
||||
def read_yaml(yaml_filename: str) -> dict:
    """Read a YAML file and return its parsed content.

    Args:
        yaml_filename: path of the YAML file; it is opened as UTF-8 text.

    Returns:
        The object produced by ``yaml.safe_load``, or an empty dict when the
        document is empty (or otherwise parses to a falsy value), so callers
        can always use ``.get(...)`` on the result.

    Raises:
        FileNotFoundError: propagated from ``open`` when the file is missing.
    """
    with open(yaml_filename, "r", encoding="utf-8") as inf:
        # safe_load yields None for an empty document; normalise it to a dict
        # to honour the declared return type.
        return yaml.safe_load(inf) or {}
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
import os
|
||||
from os.path import join, dirname
|
||||
from typing import List
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
_DEFAULT_MANIFEST = {
|
||||
'author': 'Bellingcat',
|
||||
'requires_setup': True,
|
||||
'depends': [],
|
||||
'description': '',
|
||||
'external_dependencies': {},
|
||||
'entry_point': '',
|
||||
'version': '1.0',
|
||||
}
|
||||
|
||||
def load_manifest(module):
    """Return the raw text of a module's manifest file.

    This is a module-level function, so it takes no ``self``; it is called
    with the module path as its only argument.

    Args:
        module: path of the module directory that contains ``MANIFEST_FILE``.

    Returns:
        The unparsed content of the manifest file as a string.

    Raises:
        OSError: propagated from ``open`` when the manifest cannot be read.
    """
    # explicit encoding so the result does not depend on the platform default
    with open(join(module, MANIFEST_FILE), encoding="utf-8") as f:
        return f.read()
|
||||
|
||||
def available_modules(additional_paths: List[str] = None) -> List[str]:
    """List the module folders found in the known search paths.

    The default search path is the ``modules`` directory located one level
    above this file's directory; ``additional_paths`` are searched after it.
    A folder counts as a module when it directly contains ``MANIFEST_FILE``.

    Args:
        additional_paths: extra directories to search, or ``None`` for none.

    Returns:
        The paths (search path joined with folder name) of every module
        found, in search-path order and sorted by name within each path.
    """
    # see odoo/modules/module.py -> get_modules
    def is_really_module(name):
        return os.path.isfile(join(name, MANIFEST_FILE))

    search_paths = [join(dirname(dirname(__file__)), "modules")]
    search_paths.extend(additional_paths or [])

    all_modules = []
    for module_folder in search_paths:
        # skip search paths that are not existing directories instead of
        # letting os.listdir raise
        if not os.path.isdir(module_folder):
            continue
        # os.listdir order is arbitrary; sort for a deterministic result
        for folder in sorted(os.listdir(module_folder)):
            possible_module = join(module_folder, folder)
            if is_really_module(possible_module):
                all_modules.append(possible_module)

    return all_modules
|
|
@ -5,9 +5,13 @@
|
|||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import ast
|
||||
import os
|
||||
from os.path import dirname, join
|
||||
from typing import Generator, Union, List
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
import argparse
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
|
@ -18,27 +22,78 @@ from ..storages import Storage
|
|||
from ..enrichers import Enricher
|
||||
from ..databases import Database
|
||||
from .metadata import Metadata
|
||||
from ..version import __version__
|
||||
from .config import read_yaml
|
||||
from .loader import available_modules, load_manifest
|
||||
|
||||
import tempfile, traceback
|
||||
from loguru import logger
|
||||
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
class ArchivingOrchestrator:
|
||||
def __init__(self, config) -> None:
|
||||
self.feeder: Feeder = config.feeder
|
||||
self.formatter: Formatter = config.formatter
|
||||
self.enrichers: List[Enricher] = config.enrichers
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
try:
|
||||
for a in self.all_archivers_for_setup(): a.setup()
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
self.cleanup()
|
||||
# def __init__(self, config: Config) -> None:
|
||||
# self.feeder: Feeder = config.feeder
|
||||
# self.formatter: Formatter = config.formatter
|
||||
# self.enrichers: List[Enricher] = config.enrichers
|
||||
# self.archivers: List[Archiver] = config.archivers
|
||||
# self.databases: List[Database] = config.databases
|
||||
# self.storages: List[Storage] = config.storages
|
||||
# ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
# try:
|
||||
# for a in self.all_archivers_for_setup(): a.setup()
|
||||
# except (KeyboardInterrupt, Exception) as e:
|
||||
# logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
# self.cleanup()
|
||||
|
||||
def setup_parser(self):
    """Build the base command-line parser and store it on ``self.parser``.

    Registers ``--config`` (stored as ``config_file``, defaulting to
    ``DEFAULT_CONFIG_FILE``), ``--version`` and ``--mode`` (``simple`` or
    ``full``, defaulting to ``simple``).
    """
    parser = argparse.ArgumentParser(
        # prog = "auto-archiver",
        description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
        epilog="Check the code at https://github.com/bellingcat/auto-archiver",
    )
    # the help text is derived from the constant so it cannot drift from the real default
    parser.add_argument('--config', action='store', dest='config_file', help=f"the filename of the YAML configuration file (defaults to '{DEFAULT_CONFIG_FILE}')", default=DEFAULT_CONFIG_FILE)
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
    self.parser = parser
|
||||
|
||||
def setup_config(self):
|
||||
# check what mode we're in
|
||||
# if simple, we'll load just the modules that have requires_setup = False
|
||||
# if full, we'll load all modules
|
||||
if self.config.mode == 'simple':
|
||||
for module in available_modules():
|
||||
# load the module
|
||||
manifest = load_manifest(module)
|
||||
|
||||
|
||||
def run(self) -> None:
    """Parse the base CLI arguments, load the configuration and run the feed.

    The parsed known arguments are stored on ``self.config``. If the config
    file is missing and it is the default one, ``setup_config`` is called;
    if a non-default file is missing, an error is logged and the process
    exits with a non-zero status.

    Raises:
        SystemExit: when an explicitly requested config file does not exist.
    """
    self.setup_parser()

    # parse the known arguments for now (basically, we want the config file);
    # unknown arguments are ignored at this stage
    self.config, _ = self.parser.parse_known_args()

    # load the config file to get the list of enabled items
    try:
        config = read_yaml(self.config.config_file)
    except FileNotFoundError:
        # the parsed args live on self.config, with the path stored under dest 'config_file'
        if self.config.config_file == DEFAULT_CONFIG_FILE:
            # no config file found, let's do the setup with the default values
            self.setup_config()
        else:
            logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
            # a missing, explicitly requested file is an error: exit non-zero
            raise SystemExit(1)

    # NOTE(review): `config` is unbound when the default config file is missing,
    # and read_yaml is annotated to return a dict, which has no parse() method.
    # This step looks unfinished; confirm the intended flow.
    config.parse()

    for item in self.feed():
        pass
|
||||
|
||||
def cleanup(self)->None:
|
||||
logger.info("Cleaning up")
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
{
|
||||
'name': 'Generic Archiver',
|
||||
'name': 'Generic Extractor',
|
||||
'version': '0.1.0',
|
||||
'author': 'Bellingcat',
|
||||
'type': ['archiver'],
|
||||
'type': ['extractor'],
|
||||
'entry_point': 'generic_extractor:GenericExtractor',
|
||||
'requires_setup': False,
|
||||
'depends': ['core'],
|
||||
'external_dependencies': {
|
||||
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
|
||||
},
|
||||
'description': """
|
||||
This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood.
|
||||
This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.
|
||||
|
||||
This module is responsible for downloading and processing media content from platforms
|
||||
supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
|
|
@ -8,7 +8,7 @@ from loguru import logger
|
|||
from auto_archiver.archivers.archiver import Archiver
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
|
||||
class GenericArchiver(Archiver):
|
||||
class GenericExtractor(Archiver):
|
||||
name = "youtubedl_archiver" #left as is for backwards compat
|
||||
_dropins = {}
|
||||
|
|
@ -6,13 +6,13 @@ from os.path import dirname
|
|||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.archivers.generic_archiver import GenericArchiver
|
||||
from auto_archiver.archivers.generic_extractor.generic_extractor import GenericExtractor
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
|
||||
class TestGenericArchiver(TestArchiverBase):
|
||||
class TestGenericExtractor(TestArchiverBase):
|
||||
"""Tests Base Archiver
|
||||
"""
|
||||
archiver_class = GenericArchiver
|
||||
archiver_class = GenericExtractor
|
||||
config = {
|
||||
'subtitles': False,
|
||||
'comments': False,
|
||||
|
|
Ładowanie…
Reference in New Issue