Mirror of https://github.com/bellingcat/auto-archiver
Further cleanup
* Removes (partly) the ArchivingOrchestrator
* Removes the cli_feeder module and makes it the default, allowing you to pass URLs directly on the command line without the cumbersome --cli_feeder.urls. Just do: auto-archiver https://my.url.com
* More unit tests
* Improved error handling

Branch: pull/189/head
parent 953011f368
commit d6b4b7a932
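
A quick sketch of the new invocation path (the command shown comes from the commit message; the rest follows the diffs below): the entry point now strips the program name before handing arguments to the orchestrator, so bare URLs on the command line become the default feed.

# e.g. invoked as: auto-archiver https://my.url.com
import sys
from auto_archiver.core.orchestrator import ArchivingOrchestrator

# sys.argv[1:] drops the program name, leaving ["https://my.url.com"],
# which the orchestrator's implicit CLI feeder consumes.
ArchivingOrchestrator().run(sys.argv[1:])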
@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys

def main():
    ArchivingOrchestrator().run(sys.argv)
    ArchivingOrchestrator().run(sys.argv[1:])

if __name__ == "__main__":
    main()
@ -0,0 +1,100 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory

from loguru import logger

class BaseModule(ABC):

    """
    Base module class. All modules should inherit from this class.

    The exact methods a class implements will depend on the type of module it is,
    however all modules have a .setup(config: dict) method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
    default manifest structure.
    """

    MODULE_TYPES = [
        'feeder',
        'extractor',
        'enricher',
        'database',
        'storage',
        'formatter'
    ]

    _DEFAULT_MANIFEST = {
        'name': '',  # the display name of the module
        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
        'description': '',  # a description of the module
        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
        'version': '1.0',  # the version of the module
        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
    }

    config: Mapping[str, Any]
    authentication: Mapping[str, Mapping[str, str]]
    name: str

    # this is set by the orchestrator prior to archiving
    tmp_dir: TemporaryDirectory = None

    def setup(self, config: dict):

        authentication = config.get('authentication', {})
        # extract out concatenated sites
        for key, val in copy(authentication).items():
            if "," in key:
                for site in key.split(","):
                    authentication[site] = val
                del authentication[key]

        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules
        config = deepcopy(config)
        authentication = deepcopy(config.pop('authentication', {}))

        self.authentication = authentication
        self.config = config
        for key, val in config.get(self.name, {}).items():
            setattr(self, key, val)

    def repr(self):
        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"

    def auth_for_site(self, site: str) -> dict:
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now, just hard code those.

        # SECURITY: parse the domain using urllib
        site = urlparse(site).netloc
        # add the 'www' version of the site to the list of sites to check
        for to_try in [site, f"www.{site}"]:
            if to_try in self.authentication:
                return self.authentication[to_try]

        # do a fuzzy string match just to print a warning - don't use it since it's insecure
        for key in self.authentication.keys():
            if key in site or site in key:
                logger.warning(f"Could not find exact authentication information for site '{site}'. \
                    did find information for '{key}' which is close, is this what you meant? \
                    If so, edit your authentication settings to make sure it exactly matches.")

        return {}
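
A short usage sketch for the new BaseModule (the subclass and config values are hypothetical): setup() deep-copies the config, flattens comma-separated authentication keys, and exposes per-module options as attributes; auth_for_site() then matches on the parsed netloc.

class MyModule(BaseModule):  # hypothetical subclass for illustration
    name = "my_module"

m = MyModule()
m.setup({
    "my_module": {"api_key": "abc123"},
    "authentication": {"x.com,twitter.com": {"token": "t0k3n"}},
})
print(m.api_key)                                    # "abc123", set via setattr in setup()
print(m.auth_for_site("https://x.com/some/post"))   # {"token": "t0k3n"} after the comma split
print(m.auth_for_site("https://example.org"))       # {} - no exact or www. match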
@ -15,8 +15,14 @@ from .module import BaseModule
from typing import Any, List, Type, Tuple

yaml = YAML()
yaml: YAML = YAML()

b = yaml.load("""
# This is a comment
site.com,site2.com:
  key: value
  key2: value2
""")
EMPTY_CONFIG = yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
@ -25,6 +31,24 @@ steps:""" + "".join([f"\n   {module}s: []" for module in BaseModule.MODULE_TYPES
"""

# Global configuration

# Authentication
# a dictionary of authentication information that can be used by extractors to log in to websites.
# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
# Common login 'types' are username/password, cookie, api key/token.
# Some Examples:
# facebook.com:
#   username: "my_username"
#   password: "my_password"
# or for a site that uses an API key:
# twitter.com,x.com:
#   api_key
#   api_secret
# youtube.com:
#   cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;

authentication: {}

# These are the global configurations that are used by the modules

logging:
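
As a sketch of how the documented block is consumed (the filled-in values are hypothetical), the commented YAML above maps onto a plain dictionary once loaded, which BaseModule.setup() later normalises:

# Hypothetical dict equivalent of a filled-in authentication block:
authentication = {
    "facebook.com": {"username": "my_username", "password": "my_password"},
    "twitter.com,x.com": {"api_key": "my_key", "api_secret": "my_secret"},
    "youtube.com": {"cookie": "login_cookie=value ; other_cookie=123"},
}
# BaseModule.setup() splits the comma-separated key so that both
# "twitter.com" and "x.com" resolve to the same credentials.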
@ -136,12 +160,9 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
# TODO: make this tidier/find a way to notify of which keys should not be stored

def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
    config_to_save = deepcopy(config)

    for key1, key2 in do_not_store_keys:
        if key1 in config_to_save and key2 in config_to_save[key1]:
            del config_to_save[key1][key2]

    config.pop('urls', None)
    with open(yaml_filename, "w", encoding="utf-8") as outf:
        yaml.dump(config_to_save, outf)
@ -53,12 +53,4 @@ class ArchivingContext:
        if full_reset: ac.keep_on_reset = set()
        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}

    # ---- custom getters/setters for widely used context values

    @staticmethod
    def set_tmp_dir(tmp_dir: str):
        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir

    @staticmethod
    def get_tmp_dir() -> str:
        return ArchivingContext.get_instance().configs.get("tmp_dir")
    # ---- custom getters/setters for widely used context values
@ -12,7 +12,6 @@ from dataclasses import dataclass
import mimetypes
import os
import mimetypes

import requests
from loguru import logger
from retrying import retry
@ -71,7 +70,7 @@ class Extractor(BaseModule):
        to_filename = url.split('/')[-1].split('?')[0]
        if len(to_filename) > 64:
            to_filename = to_filename[-64:]
        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
        to_filename = os.path.join(self.tmp_dir, to_filename)
        if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@ -7,7 +7,6 @@ from __future__ import annotations

from dataclasses import dataclass
from typing import List
from abc import ABC
import shutil
import ast
import copy
@ -17,63 +16,12 @@ import os
from os.path import join, dirname
from loguru import logger
import auto_archiver
from .base_module import BaseModule

_LAZY_LOADED_MODULES = {}

MANIFEST_FILE = "__manifest__.py"

class BaseModule(ABC):

    """
    Base module class. All modules should inherit from this class.

    The exact methods a class implements will depend on the type of module it is,
    however all modules have a .setup(config: dict) method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
    default manifest structure.
    """

    MODULE_TYPES = [
        'feeder',
        'extractor',
        'enricher',
        'database',
        'storage',
        'formatter'
    ]

    _DEFAULT_MANIFEST = {
        'name': '',  # the display name of the module
        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
        'description': '',  # a description of the module
        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
        'version': '1.0',  # the version of the module
        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
    }

    config: dict
    name: str

    def setup(self, config: dict):
        self.config = config
        for key, val in config.get(self.name, {}).items():
            setattr(self, key, val)

    def repr(self):
        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"


def setup_paths(paths: list[str]) -> None:
    """
@ -5,12 +5,15 @@
"""

from __future__ import annotations
from typing import Generator, Union, List
from typing import Generator, Union, List, Type
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback

from rich_argparse import RichHelpFormatter
@ -18,17 +21,46 @@ from .context import ArchivingContext

from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from . import validators
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule

import tempfile, traceback
from loguru import logger


DEFAULT_CONFIG_FILE = "orchestration.yaml"

class JsonParseAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        try:
            setattr(namespace, self.dest, json.loads(values))
        except json.JSONDecodeError as e:
            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")


class AuthenticationJsonParseAction(JsonParseAction):
    def __call__(self, parser, namespace, values, option_string=None):
        super().__call__(parser, namespace, values, option_string)
        auth_dict = getattr(namespace, self.dest)
        if isinstance(auth_dict, str):
            # if it's a string, it may be a file path; try to load JSON, then YAML from it
            try:
                with open(auth_dict, 'r') as f:
                    try:
                        auth_dict = json.load(f)
                    except json.JSONDecodeError:
                        # maybe it's yaml, try that
                        auth_dict = yaml.load(f)
            except:
                pass

        if not isinstance(auth_dict, dict):
            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
        for site, auth in auth_dict.items():
            if not isinstance(site, str) or not isinstance(auth, dict):
                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
        setattr(namespace, self.dest, auth_dict)
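
A minimal sketch of the new action in use (standalone, reusing the classes defined above): a JSON string on the command line is parsed and validated into a site-to-credentials mapping; a file path would instead be opened and tried as JSON, then YAML.

import argparse

p = argparse.ArgumentParser()
p.add_argument('--authentication', default={}, action=AuthenticationJsonParseAction)

ns = p.parse_args(['--authentication', '{"facebook.com": {"username": "u", "password": "p"}}'])
print(ns.authentication)  # {'facebook.com': {'username': 'u', 'password': 'p'}}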
class UniqueAppendAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        if not hasattr(namespace, self.dest):
@ -38,9 +70,7 @@ class UniqueAppendAction(argparse.Action):
            getattr(namespace, self.dest).append(value)

class ArchivingOrchestrator:

    _do_not_store_keys = []

    def setup_basic_parser(self):
        parser = argparse.ArgumentParser(
            prog="auto-archiver",
@ -52,7 +82,7 @@ class ArchivingOrchestrator:
            epilog="Check the code at https://github.com/bellingcat/auto-archiver",
            formatter_class=RichHelpFormatter,
        )
        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit')
        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
        parser.add_argument('--version', action='version', version=__version__)
        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
        parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
@ -80,7 +110,6 @@ class ArchivingOrchestrator:
        # only load the modules enabled in config
        # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
        enabled_modules = []

        # first loads the modules from the config file, then from the command line
        for config in [yaml_config['steps'], basic_config.__dict__]:
            for module_type in BaseModule.MODULE_TYPES:
@ -120,7 +149,7 @@ class ArchivingOrchestrator:

        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
            logger.info(f"Storing configuration file to {basic_config.config_file}")
            store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
            store_yaml(self.config, basic_config.config_file)

        return self.config
@ -128,18 +157,29 @@ class ArchivingOrchestrator:
        if not parser:
            parser = self.parser

        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)

        # allow passing URLs directly on the command line
        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')

        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
        parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)

        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                            (token, username etc.) that extractors can use to log into \
                            a website. If passing this on the command line, use a JSON string. \
                            You may also pass a path to a valid JSON/YAML file which will be parsed.',
                            default={},
                            action=AuthenticationJsonParseAction)
        # logging arguments
        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
        parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
        parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
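
A minimal reconstruction of the new CLI surface (for illustration only, trimmed to the two arguments that changed): positional URLs land in 'urls', and the feeders default to the pseudo cli_feeder when none are given.

import argparse

p = argparse.ArgumentParser()
p.add_argument('urls', nargs='*', default=[])
p.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'])

args = p.parse_args(["https://my.url.com"])
print(args.urls)                       # ['https://my.url.com']
print(getattr(args, 'steps.feeders'))  # ['cli_feeder'], the new default feeder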

    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:

        if not modules:
@ -147,6 +187,7 @@ class ArchivingOrchestrator:

        module: LazyBaseModule
        for module in modules:

            if not module.configs:
                # this module has no configs, don't show anything in the help
                # (TODO: do we want to show something about this module though, like a description?)
@ -155,12 +196,6 @@ class ArchivingOrchestrator:
            group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")

            for name, kwargs in module.configs.items():
                # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
                # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
                do_not_store = kwargs.pop('do_not_store', False)
                if do_not_store:
                    self._do_not_store_keys.append((module.name, name))

                if not kwargs.get('metavar', None):
                    # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
                    kwargs['metavar'] = name.upper()
@ -208,8 +243,7 @@ class ArchivingOrchestrator:
            step_items = []
            modules_to_load = self.config['steps'][f"{module_type}s"]

            assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \
                                    in your configuration file or on the command line (using --{module_type}s)"
            assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"

            def check_steps_ok():
                if not len(step_items):
@ -223,12 +257,37 @@ class ArchivingOrchestrator:
                    exit()

            for module in modules_to_load:
                if module == 'cli_feeder':
                    urls = self.config['urls']
                    if not urls:
                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
                        self.basic_parser.print_help()
                        exit()

                    # cli_feeder is a pseudo module, it just takes the command line args
                    def feed(self) -> Generator[Metadata]:
                        for url in urls:
                            logger.debug(f"Processing URL: '{url}'")
                            yield Metadata().set_url(url)
                        ArchivingContext.set("folder", "cli")

                    pseudo_module = type('CLIFeeder', (Feeder,), {
                        'name': 'cli_feeder',
                        'display_name': 'CLI Feeder',
                        '__iter__': feed
                    })()

                    pseudo_module.__iter__ = feed
                    step_items.append(pseudo_module)
                    continue

                if module in invalid_modules:
                    continue
                try:
                    loaded_module: BaseModule = get_module(module, self.config)
                except (KeyboardInterrupt, Exception) as e:
                    logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
                    if module_type == 'extractor' and loaded_module.name == module:
                        loaded_module.cleanup()
                    exit()
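
The cli_feeder branch above builds its feeder on the fly instead of shipping a module: type() creates a one-off class whose __iter__ is the local feed generator closing over the command-line URLs. A stripped-down, self-contained sketch of the same pattern (no Feeder base class, for brevity):

urls = ["https://my.url.com"]  # would come from self.config['urls']

def feed(self):
    for url in urls:  # closes over the command-line URLs
        yield url

PseudoFeeder = type("CLIFeeder", (), {"name": "cli_feeder", "__iter__": feed})
for item in PseudoFeeder():
    print(item)

Note that the extra instance-level assignment (pseudo_module.__iter__ = feed) is redundant for iteration: Python looks special methods up on the class, so the entry in the type() dict is the one that counts.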
@ -285,13 +344,18 @@ class ArchivingOrchestrator:

    def cleanup(self) -> None:
        logger.info("Cleaning up")
        for e in self.config['steps']['extractors']:
        for e in self.extractors:
            e.cleanup()

    def feed(self) -> Generator[Metadata]:
        for feeder in self.config['steps']['feeders']:

        url_count = 0
        for feeder in self.feeders:
            for item in feeder:
                yield self.feed_item(item)
                url_count += 1

        logger.success(f"Processed {url_count} URL(s)")
        self.cleanup()

    def feed_item(self, item: Metadata) -> Metadata:
@ -300,22 +364,33 @@ class ArchivingOrchestrator:
        - catches keyboard interruptions to do a clean exit
        - catches any unexpected error, logs it, and does a clean exit
        """
        tmp_dir: TemporaryDirectory = None
        try:
            ArchivingContext.reset()
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                ArchivingContext.set_tmp_dir(tmp_dir)
                return self.archive(item)
            tmp_dir = TemporaryDirectory(dir="./")
            # set tmp_dir on all modules
            for m in self.all_modules:
                m.tmp_dir = tmp_dir.name
            return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
            logger.warning(f"caught interrupt on {item=}")
            for d in self.config['steps']['databases']: d.aborted(item)
            for d in self.databases:
                d.aborted(item)
            self.cleanup()
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.config['steps']['databases']:
                if type(e) == AssertionError: d.failed(item, str(e))
                else: d.failed(item, reason="unexpected error")
            for d in self.databases:
                if type(e) == AssertionError:
                    d.failed(item, str(e))
                else:
                    d.failed(item, reason="unexpected error")
        finally:
            if tmp_dir:
                # remove the tmp_dir from all modules
                for m in self.all_modules:
                    m.tmp_dir = None
                tmp_dir.cleanup()
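
The per-item lifecycle above replaces the old context-manager/ArchivingContext form: a TemporaryDirectory is created, its .name (a plain path string) is handed to every module, and it is cleaned up in the finally block even on errors. The pattern in isolation:

from tempfile import TemporaryDirectory

tmp_dir = None
try:
    tmp_dir = TemporaryDirectory(dir="./")
    work_path = tmp_dir.name  # modules receive the string path, not the object
    # ... archive the item using work_path ...
finally:
    if tmp_dir:
        tmp_dir.cleanup()  # always removed, even after an exception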

    def archive(self, result: Metadata) -> Union[Metadata, None]:
@ -328,31 +403,38 @@ class ArchivingOrchestrator:
        5. Store all downloaded/generated media
        6. Call selected Formatter and store formatted if needed
        """

        original_url = result.get_url().strip()
        self.assert_valid_url(original_url)
        try:
            self.assert_valid_url(original_url)
        except AssertionError as e:
            logger.error(f"Error archiving URL {original_url}: {e}")
            raise e

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
        for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
        for a in self.extractors:
            url = a.sanitize_url(url)

        result.set_url(url)
        if original_url != url: result.set("original_url", original_url)

        # 2 - notify start to DBs, propagate already archived if feature enabled in DBs
        cached_result = None
        for d in self.config["steps"]["databases"]:
        for d in self.databases:
            d.started(result)
            if (local_result := d.fetch(result)):
                cached_result = (cached_result or Metadata()).merge(local_result)
        if cached_result:
            logger.debug("Found previously archived entry")
            for d in self.config["steps"]["databases"]:
            for d in self.databases:
                try: d.done(cached_result, cached=True)
                except Exception as e:
                    logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
            return cached_result

        # 3 - call extractors until one succeeds
        for a in self.config["steps"]["extractors"]:
        for a in self.extractors:
            logger.info(f"Trying extractor {a.name} for {url}")
            try:
                result.merge(a.download(result))
@ -361,7 +443,7 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")

        # 4 - call enrichers to work with archived content
        for e in self.config["steps"]["enrichers"]:
        for e in self.enrichers:
            try: e.enrich(result)
            except Exception as exc:
                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
@ -370,7 +452,7 @@ class ArchivingOrchestrator:
        result.store()

        # 6 - format and store formatted if needed
        if final_media := self.config["steps"]["formatters"][0].format(result):
        if final_media := self.formatters[0].format(result):
            final_media.store(url=url, metadata=result)
            result.set_final_media(final_media)
@ -378,7 +460,7 @@ class ArchivingOrchestrator:
            result.status = "nothing archived"

        # signal completion to databases and archivers
        for d in self.config["steps"]["databases"]:
        for d in self.databases:
            try: d.done(result)
            except Exception as e:
                logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@ -403,4 +485,44 @@ class ArchivingOrchestrator:
        assert ip.is_global, f"Invalid IP used"
        assert not ip.is_reserved, f"Invalid IP used"
        assert not ip.is_link_local, f"Invalid IP used"
        assert not ip.is_private, f"Invalid IP used"
        assert not ip.is_private, f"Invalid IP used"


    # Helper Properties

    @property
    def feeders(self) -> List[Type[Feeder]]:
        return self._get_property('feeders')

    @property
    def extractors(self) -> List[Type[Extractor]]:
        return self._get_property('extractors')

    @property
    def enrichers(self) -> List[Type[Enricher]]:
        return self._get_property('enrichers')

    @property
    def databases(self) -> List[Type[Database]]:
        return self._get_property('databases')

    @property
    def storages(self) -> List[Type[Storage]]:
        return self._get_property('storages')

    @property
    def formatters(self) -> List[Type[Formatter]]:
        return self._get_property('formatters')

    @property
    def all_modules(self) -> List[Type[BaseModule]]:
        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters

    def _get_property(self, prop):
        try:
            f = self.config['steps'][prop]
            if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)):
                raise TypeError
            return f
        except:
            exit("Property called prior to full initialisation")
@ -0,0 +1,40 @@
from loguru import logger
import time, os
from selenium.common.exceptions import TimeoutException


from auto_archiver.core import Enricher
from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata

class ScreenshotEnricher(Enricher):
    name = "screenshot_enricher"

    @staticmethod
    def configs() -> dict:
        return {
            "width": {"default": 1280, "help": "width of the screenshots"},
            "height": {"default": 720, "help": "height of the screenshots"},
            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return

        logger.debug(f"Enriching screenshot for {url=}")
        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
            try:
                driver.get(url)
                time.sleep(int(self.sleep_before_screenshot))
                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
@ -0,0 +1,38 @@
from loguru import logger
import csv

from . import Feeder
from ..core import Metadata, ArchivingContext
from ..utils import url_or_none

class CSVFeeder(Feeder):

    @staticmethod
    def configs() -> dict:
        return {
            "files": {
                "default": None,
                "help": "Path to the input file(s) to read the URLs from, comma separated. \
                        Input files should be formatted with one URL per line",
                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            }
        }


    def __iter__(self) -> Metadata:
        url_column = self.column or 0
        for file in self.files:
            with open(file, "r") as f:
                reader = csv.reader(f)
                first_row = next(reader)
                if not(url_or_none(first_row[url_column])):
                    # it's a header row, skip it
                for row in reader:
                    url = row[0]
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
        ArchivingContext.set("folder", "cli")
@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):

            if len(data["results"]) == 0 or cursor is None:
                break

        logger.success(f"Processed {count} URL(s)")
@ -1 +0,0 @@
from .cli_feeder import CLIFeeder
@ -1,27 +0,0 @@
{
    "name": "CLI Feeder",
    "type": ["feeder"],
    "requires_setup": False,
    "dependencies": {
        "python": ["loguru"],
    },
    'entry_point': 'cli_feeder::CLIFeeder',
    "configs": {
        "urls": {
            "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
            "nargs": "+",
            "required": True,
            "do_not_store": True,
            "metavar": "INPUT URLS",
        },
    },
    "description": """
    Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.

    ### Features
    - Takes a single URL or a list of URLs provided via the command line.
    - Converts each URL into a `Metadata` object and yields it for processing.
    - Ensures URLs are processed only if they are explicitly provided.
    """
}
@ -1,15 +0,0 @@
from loguru import logger

from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext


class CLIFeeder(Feeder):

    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing URL: '{url}'")
            yield Metadata().set_url(url)
        ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {len(self.urls)} URL(s)")
@ -26,7 +26,6 @@
    - Supports reading URLs from multiple input files, specified as a comma-separated list.
    - Allows specifying the column number or name to extract URLs from.
    - Skips header rows if the first value is not a valid URL.
    - Integrates with the `ArchivingContext` to manage URL feeding.

    ### Setup Notes
    - Input files should be formatted with one URL per line.
@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
                    url = row[0]
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
                ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {len(self.urls)} URL(s)")
        ArchivingContext.set("folder", "cli")
@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
                       'quiet': False, 'noplaylist': not self.allow_playlist,
                       'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles,
                       "live_from_start": self.live_from_start, "proxy": self.proxy,
                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}

        if item.netloc in ['youtube.com', 'www.youtube.com']:
            if self.cookies_from_browser:
@ -7,7 +7,7 @@ import json
import base64

from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
            version=__version__
        )

        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
        html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        final_media = Media(filename=html_path, _mimetype="text/html")
@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException

from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata

class ScreenshotEnricher(Enricher):
@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher):
            try:
                driver.get(url)
                time.sleep(int(self.sleep_before_screenshot))
                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
                if self.save_to_pdf:
                    pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
                    pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
                    pdf = driver.print_page(driver.print_options)
                    with open(pdf_file, "wb") as f:
                        f.write(base64.b64decode(pdf))
@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
        logger.debug(f"fetching SSL certificate for {domain=} in {url=}")

        cert = ssl.get_server_certificate((domain, 443))
        cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
        cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
        with open(cert_fn, "w") as f: f.write(cert)
        to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
@ -9,7 +9,7 @@ from tqdm import tqdm
import re, time, json, os

from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@ -120,7 +120,7 @@ class TelethonArchiver(Extractor):
        media_posts = self._get_media_posts_in_group(chat, post)
        logger.debug(f'got {len(media_posts)=} for {url=}')

        tmp_dir = ArchivingContext.get_tmp_dir()
        tmp_dir = self.tmp_dir

        group_id = post.grouped_id if post.grouped_id is not None else post.id
        title = post.message
@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
        logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
        for m_id, m in enumerate(to_enrich.media[::]):
            if m.is_video():
                folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
                folder = os.path.join(self.tmp_dir, random_str(24))
                os.makedirs(folder, exist_ok=True)
                logger.debug(f"generating thumbnails for {m.filename}")
                duration = m.get("duration")
@ -9,9 +9,7 @@ from asn1crypto import pem
import certifi

from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Extractor

from auto_archiver.core import Metadata, Media

class TimestampingEnricher(Enricher):
    """
@ -33,7 +31,7 @@ class TimestampingEnricher(Enricher):
            logger.warning(f"No hashes found in {url=}")
            return

        tmp_dir = ArchivingContext.get_tmp_dir()
        tmp_dir = self.tmp_dir
        hashes_fn = os.path.join(tmp_dir, "hashes.txt")

        data_to_sign = "\n".join(hashes)
@ -93,7 +91,7 @@ class TimestampingEnricher(Enricher):

        cert_chain = []
        for cert in path:
            cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
            cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
            with open(cert_fn, "wb") as f:
                f.write(cert.dump())
            cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper

from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media


class VkExtractor(Extractor):
@ -35,7 +35,7 @@ class VkExtractor(Extractor):

        result.set_content(dump_payload(vk_scrapes))

        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
        for filename in filenames:
            result.add_media(Media(filename))
@ -5,7 +5,7 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator

from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
        url = to_enrich.get_url()

        collection = random_str(8)
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

        cmd = [
@ -154,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
        logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")

        # unzipping the .wacz
        tmp_dir = ArchivingContext.get_tmp_dir()
        tmp_dir = self.tmp_dir
        unzipped_dir = os.path.join(tmp_dir, "unzipped")
        with ZipFile(wacz_filename, 'r') as z_obj:
            z_obj.extractall(path=unzipped_dir)
@ -2,5 +2,4 @@ import tempfile

from auto_archiver.core.context import ArchivingContext

ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())
ArchivingContext.reset(full_reset=True)
@ -2,6 +2,7 @@
pytest conftest file, for shared fixtures and configuration
"""

from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest
@ -25,8 +26,13 @@ def setup_module(request):

    m = get_module(module_name, {module_name: config})

    # add the tmp_dir to the module
    tmp_dir = TemporaryDirectory()
    m.tmp_dir = tmp_dir

    def cleanup():
        _LAZY_LOADED_MODULES.pop(module_name)
        tmp_dir.cleanup()
    request.addfinalizer(cleanup)

    return m
@ -1,6 +1,6 @@
import pytest
import sys
from argparse import ArgumentParser
from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
@ -113,16 +113,23 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):

    # run the orchestrator
    orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
    assert orchestrator.config is not None

# should run OK, since there are no missing required fields
def test_load_authentication_string(orchestrator, test_args):

    # basic_args = basic_parser.parse_known_args(test_args)
    # test_yaml = read_yaml(TEST_ORCHESTRATION)
    # test_yaml['example_module'] = {'required_field': 'some_value'}
    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}

    # # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True
    # # load the module first
    # m = get_module_lazy("example_module")
def test_load_authentication_string_concat_site(orchestrator, test_args):

    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
                                                     "twitter.com": {"api_key": "my_key"}}

    # orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[])
    # assert orchestrator.config is not None
def test_load_invalid_authentication_string(orchestrator, test_args):
    with pytest.raises(ArgumentTypeError):
        orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])

def test_load_authentication_invalid_dict(orchestrator, test_args):
    with pytest.raises(ArgumentTypeError):
        orchestrator.run(test_args + ["--authentication", "[true, false]"])