Further cleanup

* Partially removes the ArchivingOrchestrator
* Removes the cli_feeder module and makes its behaviour the 'default', so you can pass URLs directly on the command line without the cumbersome --cli_feeder.urls option. Just run auto-archiver https://my.url.com (see the examples below)
* More unit tests
* Improved error handling
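
For illustration, with cli_feeder now the default feeder both of the following invocations work (the URLs are placeholders), where previously you had to write --cli_feeder.urls=https://my.url.com:

auto-archiver https://my.url.com
auto-archiver https://my.url.com https://another.url.com --config orchestration.yaml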
pull/189/head
Patrick Robertson 2025-01-30 16:43:09 +01:00
parent 953011f368
commit d6b4b7a932
27 changed files with 417 additions and 191 deletions

View file

@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run(sys.argv)
ArchivingOrchestrator().run(sys.argv[1:])
if __name__ == "__main__":
main()

View file

@ -0,0 +1,100 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from loguru import logger
class BaseModule(ABC):
"""
Base module class. All modules should inherit from this class.
The exact methods a class implements will depend on the type of module it is,
however all modules have a .setup(config: dict) method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
def setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
config = deepcopy(config)
authentication = deepcopy(config.pop('authentication', {}))
self.authentication = authentication
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
def auth_for_site(self, site: str) -> dict:
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now, just hard code those.
# SECURITY: parse the domain using urllib
site = urlparse(site).netloc
# add the 'www' version of the site to the list of sites to check
for to_try in [site, f"www.{site}"]:
if to_try in self.authentication:
return self.authentication[to_try]
# do a fuzzy string match just to print a warning - don't use it since it's insecure
for key in self.authentication.keys():
if key in site or site in key:
logger.warning(f"Could not find exact authentication information for site '{site}'. \
Did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it matches exactly.")
return {}
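
A minimal usage sketch (ExampleModule and the values below are made up for illustration): setup() copies per-module options onto the instance and splits comma-concatenated authentication keys, and auth_for_site() then resolves credentials by domain.

class ExampleModule(BaseModule):
    name = "example"

conf = {
    "example": {"timeout": 5},
    "authentication": {"x.com,twitter.com": {"api_key": "my_key"}},
}
m = ExampleModule()
m.setup(conf)
assert m.timeout == 5  # per-module options become attributes
assert m.auth_for_site("https://twitter.com/some/post") == {"api_key": "my_key"}  # concatenated key split per site
assert m.auth_for_site("https://example.org/page") == {}  # unknown sites return an empty dict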

View file

@ -15,8 +15,14 @@ from .module import BaseModule
from typing import Any, List, Type, Tuple
yaml = YAML()
yaml: YAML = YAML()
b = yaml.load("""
# This is a comment
site.com,site2.com:
key: value
key2: value2
""")
EMPTY_CONFIG = yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
@ -25,6 +31,24 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES
"""
# Global configuration
# Authentication
# a dictionary of authentication information that can be used by extractors to log in to websites.
# you can use a comma-separated list for multiple domains on the same line (common use case: x.com,twitter.com)
# Common login 'types' are username/password, cookie, api key/token.
# Some Examples:
# facebook.com:
# username: "my_username"
# password: "my_password"
# or for a site that uses an API key:
# twitter.com,x.com:
# api_key
# api_secret
# youtube.com:
# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
authentication: {}
# These are the global configurations that are used by the modules
logging:
@ -136,12 +160,9 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
# TODO: make this tidier/find a way to notify of which keys should not be stored
def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
for key1, key2 in do_not_store_keys:
if key1 in config_to_save and key2 in config_to_save[key1]:
del config_to_save[key1][key2]
config_to_save.pop('urls', None)  # don't persist command-line URLs to the saved config
with open(yaml_filename, "w", encoding="utf-8") as outf:
yaml.dump(config_to_save, outf)
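
As a sketch of what the authentication block documented above parses to (the values are placeholders), the module-level ruamel YAML instance returns a plain mapping; note the comma-separated site key stays a single key here and is only split per site later, in BaseModule.setup():

auth_config = yaml.load("""
authentication:
  x.com,twitter.com:
    api_key: "my_key"
  youtube.com:
    cookie: "login_cookie=value"
""")
assert auth_config["authentication"]["x.com,twitter.com"]["api_key"] == "my_key"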

View file

@ -53,12 +53,4 @@ class ArchivingContext:
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
@staticmethod
def set_tmp_dir(tmp_dir: str):
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
@staticmethod
def get_tmp_dir() -> str:
return ArchivingContext.get_instance().configs.get("tmp_dir")
# ---- custom getters/setters for widely used context values

View file

@ -12,7 +12,6 @@ from dataclasses import dataclass
import mimetypes
import os
import mimetypes
import requests
from loguru import logger
from retrying import retry
@ -71,7 +70,7 @@ class Extractor(BaseModule):
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

View file

@ -7,7 +7,6 @@ from __future__ import annotations
from dataclasses import dataclass
from typing import List
from abc import ABC
import shutil
import ast
import copy
@ -17,63 +16,12 @@ import os
from os.path import join, dirname
from loguru import logger
import auto_archiver
from .base_module import BaseModule
_LAZY_LOADED_MODULES = {}
MANIFEST_FILE = "__manifest__.py"
class BaseModule(ABC):
"""
Base module class. All modules should inherit from this class.
The exact methods a class implements will depend on the type of module it is,
however all modules have a .setup(config: dict) method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
config: dict
name: str
def setup(self, config: dict):
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
def setup_paths(paths: list[str]) -> None:
"""

View file

@ -5,12 +5,15 @@
"""
from __future__ import annotations
from typing import Generator, Union, List
from typing import Generator, Union, List, Type
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback
from rich_argparse import RichHelpFormatter
@ -18,17 +21,46 @@ from .context import ArchivingContext
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from . import validators
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
import tempfile, traceback
from loguru import logger
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
setattr(namespace, self.dest, json.loads(values))
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string, treat it as a path to a JSON/YAML file and try to parse that
try:
with open(auth_dict, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
# maybe it's yaml, try that
auth_dict = yaml.load(f)
except:
pass
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
@ -38,9 +70,7 @@ class UniqueAppendAction(argparse.Action):
getattr(namespace, self.dest).append(value)
class ArchivingOrchestrator:
_do_not_store_keys = []
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
@ -52,7 +82,7 @@ class ArchivingOrchestrator:
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit')
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
@ -80,7 +110,6 @@ class ArchivingOrchestrator:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
@ -120,7 +149,7 @@ class ArchivingOrchestrator:
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
store_yaml(self.config, basic_config.config_file)
return self.config
@ -128,18 +157,29 @@ class ArchivingOrchestrator:
if not parser:
parser = self.parser
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
default={},
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
@ -147,6 +187,7 @@ class ArchivingOrchestrator:
module: LazyBaseModule
for module in modules:
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
@ -155,12 +196,6 @@ class ArchivingOrchestrator:
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items():
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
do_not_store = kwargs.pop('do_not_store', False)
if do_not_store:
self._do_not_store_keys.append((module.name, name))
if not kwargs.get('metavar', None):
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
kwargs['metavar'] = name.upper()
@ -208,8 +243,7 @@ class ArchivingOrchestrator:
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \
in your configuration file or on the command line (using --{module_type}s)"
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
def check_steps_ok():
if not len(step_items):
@ -223,12 +257,37 @@ class ArchivingOrchestrator:
exit()
for module in modules_to_load:
if module == 'cli_feeder':
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
self.basic_parser.print_help()
exit()
# cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
pseudo_module = type('CLIFeeder', (Feeder,), {
'name': 'cli_feeder',
'display_name': 'CLI Feeder',
'__iter__': feed
})()
pseudo_module.__iter__ = feed
step_items.append(pseudo_module)
continue
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
loaded_module.cleanup()
exit()
@ -285,13 +344,18 @@ class ArchivingOrchestrator:
def cleanup(self)->None:
logger.info("Cleaning up")
for e in self.config['steps']['extractors']:
for e in self.extractors:
e.cleanup()
def feed(self) -> Generator[Metadata]:
for feeder in self.config['steps']['feeders']:
url_count = 0
for feeder in self.feeders:
for item in feeder:
yield self.feed_item(item)
url_count += 1
logger.success(f"Processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:
@ -300,22 +364,33 @@ class ArchivingOrchestrator:
- catches keyboard interruptions to do a clean exit
- catches any unexpected error, logs it, and does a clean exit
"""
tmp_dir: TemporaryDirectory = None
try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
tmp_dir = TemporaryDirectory(dir="./")
# set tmp_dir on all modules
for m in self.all_modules:
m.tmp_dir = tmp_dir.name
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.config['steps']['databases']: d.aborted(item)
for d in self.databases:
d.aborted(item)
self.cleanup()
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.config['steps']['databases']:
if type(e) == AssertionError: d.failed(item, str(e))
else: d.failed(item, reason="unexpected error")
for d in self.databases:
if type(e) == AssertionError:
d.failed(item, str(e))
else:
d.failed(item, reason="unexpected error")
finally:
if tmp_dir:
# remove the tmp_dir from all modules
for m in self.all_modules:
m.tmp_dir = None
tmp_dir.cleanup()
def archive(self, result: Metadata) -> Union[Metadata, None]:
@ -328,31 +403,38 @@ class ArchivingOrchestrator:
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
"""
original_url = result.get_url().strip()
self.assert_valid_url(original_url)
try:
self.assert_valid_url(original_url)
except AssertionError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
for a in self.extractors:
url = a.sanitize_url(url)
result.set_url(url)
if original_url != url: result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
cached_result = None
for d in self.config["steps"]["databases"]:
for d in self.databases:
d.started(result)
if (local_result := d.fetch(result)):
cached_result = (cached_result or Metadata()).merge(local_result)
if cached_result:
logger.debug("Found previously archived entry")
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call extractors until one succeeds
for a in self.config["steps"]["extractors"]:
for a in self.extractors:
logger.info(f"Trying extractor {a.name} for {url}")
try:
result.merge(a.download(result))
@ -361,7 +443,7 @@ class ArchivingOrchestrator:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.config["steps"]["enrichers"]:
for e in self.enrichers:
try: e.enrich(result)
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
@ -370,7 +452,7 @@ class ArchivingOrchestrator:
result.store()
# 6 - format and store formatted if needed
if final_media := self.config["steps"]["formatters"][0].format(result):
if final_media := self.formatters[0].format(result):
final_media.store(url=url, metadata=result)
result.set_final_media(final_media)
@ -378,7 +460,7 @@ class ArchivingOrchestrator:
result.status = "nothing archived"
# signal completion to databases and archivers
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@ -403,4 +485,44 @@ class ArchivingOrchestrator:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# Helper Properties
@property
def feeders(self) -> List[Type[Feeder]]:
return self._get_property('feeders')
@property
def extractors(self) -> List[Type[Extractor]]:
return self._get_property('extractors')
@property
def enrichers(self) -> List[Type[Enricher]]:
return self._get_property('enrichers')
@property
def databases(self) -> List[Type[Database]]:
return self._get_property('databases')
@property
def storages(self) -> List[Type[Storage]]:
return self._get_property('storages')
@property
def formatters(self) -> List[Type[Formatter]]:
return self._get_property('formatters')
@property
def all_modules(self) -> List[Type[BaseModule]]:
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
def _get_property(self, prop):
try:
f = self.config['steps'][prop]
if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)):
raise TypeError
return f
except:
exit("Property called prior to full initialisation")
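
A hedged end-to-end sketch (the URL and credentials are placeholders): run() takes the same argument list that __main__ passes from sys.argv[1:], so positional URLs are picked up by the built-in cli_feeder and --authentication accepts either a JSON string or a path to a JSON/YAML file.

from auto_archiver.core.orchestrator import ArchivingOrchestrator

ArchivingOrchestrator().run([
    "https://example.com/post/123",
    "--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}',
])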

View file

@ -0,0 +1,40 @@
from loguru import logger
import time, os
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"
@staticmethod
def configs() -> dict:
return {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
}
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

View file

@ -0,0 +1,38 @@
from loguru import logger
import csv
from . import Feeder
from ..core import Metadata, ArchivingContext
from ..utils import url_or_none
class CSVFeeder(Feeder):
@staticmethod
def configs() -> dict:
return {
"files": {
"default": None,
"help": "Path to the input file(s) to read the URLs from, comma separated. \
Input files should be formatted with one URL per line",
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
},
"column": {
"default": None,
"help": "Column number or name to read the URLs from, 0-indexed",
}
}
def __iter__(self) -> Metadata:
url_column = self.column or 0
for file in self.files:
with open(file, "r") as f:
reader = csv.reader(f)
first_row = next(reader)
if not(url_or_none(first_row[url_column])):
# it's a header row, skip it
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
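
An illustrative input file for this feeder (contents made up): the first row's value is not a valid URL, so it is treated as a header and skipped, and each following row yields one Metadata item. Assuming the module keeps its existing csv_feeder name, it could be enabled with something like --feeders csv_feeder --csv_feeder.files my_urls.csv.

url,notes
https://example.com/post/1,first
https://example.com/post/2,second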

View file

@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):
if len(data["results"]) == 0 or cursor is None:
break
logger.success(f"Processed {count} URL(s)")

View file

@ -1 +0,0 @@
from .cli_feeder import CLIFeeder

View file

@ -1,27 +0,0 @@
{
"name": "CLI Feeder",
"type": ["feeder"],
"requires_setup": False,
"dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
"required": True,
"do_not_store": True,
"metavar": "INPUT URLS",
},
},
"description": """
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
### Features
- Takes a single URL or a list of URLs provided via the command line.
- Converts each URL into a `Metadata` object and yields it for processing.
- Ensures URLs are processed only if they are explicitly provided.
"""
}

View file

@ -1,15 +0,0 @@
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")

View file

@ -26,7 +26,6 @@
- Supports reading URLs from multiple input files, specified as a comma-separated list.
- Allows specifying the column number or name to extract URLs from.
- Skips header rows if the first value is not a valid URL.
- Integrates with the `ArchivingContext` to manage URL feeding.
### Setup
- Input files should be formatted with one URL per line.

View file

@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")
ArchivingContext.set("folder", "cli")

View file

@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
"live_from_start": self.live_from_start, "proxy": self.proxy,
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:

View file

@ -7,7 +7,7 @@ import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")

View file

@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher):
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))

View file

@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
with open(cert_fn, "w") as f: f.write(cert)
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")

View file

@ -9,7 +9,7 @@ from tqdm import tqdm
import re, time, json, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@ -120,7 +120,7 @@ class TelethonArchiver(Extractor):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message

View file

@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
duration = m.get("duration")

View file

@ -9,9 +9,7 @@ from asn1crypto import pem
import certifi
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class TimestampingEnricher(Enricher):
"""
@ -33,7 +31,7 @@ class TimestampingEnricher(Enricher):
logger.warning(f"No hashes found in {url=}")
return
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
data_to_sign = "\n".join(hashes)
@ -93,7 +91,7 @@ class TimestampingEnricher(Enricher):
cert_chain = []
for cert in path:
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
with open(cert_fn, "wb") as f:
f.write(cert.dump())
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))

View file

@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
@ -35,7 +35,7 @@ class VkExtractor(Extractor):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))

View file

@ -5,7 +5,7 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
@ -154,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)

View file

@ -2,5 +2,4 @@ import tempfile
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())
ArchivingContext.reset(full_reset=True)

View file

@ -2,6 +2,7 @@
pytest conftest file, for shared fixtures and configuration
"""
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest
@ -25,8 +26,13 @@ def setup_module(request):
m = get_module(module_name, {module_name: config})
# add the tmp_dir to the module
tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir
def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup()
request.addfinalizer(cleanup)
return m

View file

@ -1,6 +1,6 @@
import pytest
import sys
from argparse import ArgumentParser
from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
@ -113,16 +113,23 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
# run the orchestrator
orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert orchestrator.config is not None
# should run OK, since there are no missing required fields
def test_load_authentication_string(orchestrator, test_args):
# basic_args = basic_parser.parse_known_args(test_args)
# test_yaml = read_yaml(TEST_ORCHESTRATION)
# test_yaml['example_module'] = {'required_field': 'some_value'}
orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
# # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True
# # load the module first
# m = get_module_lazy("example_module")
def test_load_authentication_string_concat_site(orchestrator, test_args):
orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
"twitter.com": {"api_key": "my_key"}}
# orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[])
# assert orchestrator.config is not None
def test_load_invalid_authentication_string(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
def test_load_authentication_invalid_dict(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "[true, false]"])