From d6b4b7a932b7c8840265890583b79dc7e5038b47 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Thu, 30 Jan 2025 16:43:09 +0100
Subject: [PATCH] Further cleanup

* Removes parts of the ArchivingOrchestrator
* Removes the cli_feeder module and makes its behaviour the default,
  allowing you to pass URLs directly on the command line without the
  cumbersome --cli_feeder.urls. Just run: auto-archiver https://my.url.com
* More unit tests
* Improved error handling
---
 src/auto_archiver/__main__.py                 |   2 +-
 src/auto_archiver/core/base_module.py         | 100 +++++++++
 src/auto_archiver/core/config.py              |  33 ++-
 src/auto_archiver/core/context.py             |  10 +-
 src/auto_archiver/core/extractor.py           |   3 +-
 src/auto_archiver/core/module.py              |  54 +----
 src/auto_archiver/core/orchestrator.py        | 200 ++++++++++++++----
 .../enrichers/screenshot_enricher.py          |  40 ++++
 src/auto_archiver/feeders/csv_feeder.py       |  38 ++++
 .../modules/atlos_feeder/atlos_feeder.py      |   2 -
 .../modules/cli_feeder/__init__.py            |   1 -
 .../modules/cli_feeder/__manifest__.py        |  27 ---
 .../modules/cli_feeder/cli_feeder.py          |  15 --
 .../modules/csv_feeder/__manifest__.py        |   1 -
 .../modules/csv_feeder/csv_feeder.py          |   4 +-
 .../generic_extractor/generic_extractor.py    |   6 +-
 .../modules/html_formatter/html_formatter.py  |   4 +-
 .../screenshot_enricher.py                    |   6 +-
 .../modules/ssl_enricher/ssl_enricher.py      |   2 +-
 .../telethon_extractor/telethon_extractor.py  |   4 +-
 .../thumbnail_enricher/thumbnail_enricher.py  |   2 +-
 .../timestamping_enricher.py                  |   8 +-
 .../modules/vk_extractor/vk_extractor.py      |   4 +-
 .../modules/wacz_enricher/wacz_enricher.py    |   6 +-
 tests/__init__.py                             |   3 +-
 tests/conftest.py                             |   6 +
 tests/test_orchestrator.py                    |  27 ++-
 27 files changed, 417 insertions(+), 191 deletions(-)
 create mode 100644 src/auto_archiver/core/base_module.py
 create mode 100644 src/auto_archiver/enrichers/screenshot_enricher.py
 create mode 100644 src/auto_archiver/feeders/csv_feeder.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/__init__.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/__manifest__.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/cli_feeder.py

diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 0e2f54f..0023a59 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys
 
 def main():
-    ArchivingOrchestrator().run(sys.argv)
+    ArchivingOrchestrator().run(sys.argv[1:])
 
 if __name__ == "__main__":
     main()
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
new file mode 100644
index 0000000..a9a904f
--- /dev/null
+++ b/src/auto_archiver/core/base_module.py
@@ -0,0 +1,100 @@
+
+
+from urllib.parse import urlparse
+from typing import Mapping, Any
+from abc import ABC
+from copy import deepcopy, copy
+
+from loguru import logger
+
+class BaseModule(ABC):
+
+    """
+    Base module class. All modules should inherit from this class.
+
+    The exact methods a class implements will depend on the type of module it is,
+    however all modules have a .setup(config: dict) method to run any setup code
+    (e.g. logging in to a site, spinning up a browser etc.)
+
+    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
+    a subclass can be of multiple types. For example, a module that extracts data from
+    a website and stores it in a database would be both an 'extractor' and a 'database' module.
+
+    Each module is a python package, and should have a __manifest__.py file in the
+    same directory as the module file. The __manifest__.py specifies the module information
+    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
+    default manifest structure.
+
+    """
+
+    MODULE_TYPES = [
+        'feeder',
+        'extractor',
+        'enricher',
+        'database',
+        'storage',
+        'formatter'
+    ]
+
+    _DEFAULT_MANIFEST = {
+        'name': '',  # the display name of the module
+        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
+        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
+        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
+        'description': '',  # a description of the module
+        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
+        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
+        'version': '1.0',  # the version of the module
+        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
+    }
+
+    config: Mapping[str, Any]
+    authentication: Mapping[str, Mapping[str, str]]
+    name: str
+
+    # this is set by the orchestrator prior to archiving
+    tmp_dir: str = None
+
+    def setup(self, config: dict):
+        # this is important. Each instance is given its own deepcopied config, so modules cannot
+        # change values to affect other modules
+        config = deepcopy(config)
+        authentication = config.pop('authentication', {})
+
+        # expand concatenated sites, e.g. 'x.com,twitter.com' becomes one entry per site
+        for key, val in copy(authentication).items():
+            if "," in key:
+                for site in key.split(","):
+                    authentication[site] = val
+                del authentication[key]
+
+        self.authentication = authentication
+        self.config = config
+        for key, val in config.get(self.name, {}).items():
+            setattr(self, key, val)
+
+    def __repr__(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
+
+    def auth_for_site(self, site: str) -> dict:
+        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
+        # for now, just hard code those.
+
+        # SECURITY: parse the domain using urllib
+        site = urlparse(site).netloc
+        # add the 'www' version of the site to the list of sites to check
+        for to_try in [site, f"www.{site}"]:
+            if to_try in self.authentication:
+                return self.authentication[to_try]
+
+        # do a fuzzy string match just to print a warning - don't use it since it's insecure
+        for key in self.authentication.keys():
+            if key in site or site in key:
+                logger.warning(f"Could not find exact authentication information for site '{site}'. "
+                               f"Did find information for '{key}' which is close, is this what you meant? "
+                               f"If so, edit your authentication settings to make sure it exactly matches.")
+
+        return {}
\ No newline at end of file
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index ca8ed25..2d462e4 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -15,8 +15,8 @@ from .module import BaseModule
 from typing import Any, List, Type, Tuple
 
-yaml = YAML()
+yaml: YAML = YAML()
 
 EMPTY_CONFIG = yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined
@@ -25,6 +31,24 @@ steps:""" + "".join([f"\n  {module}s: []" for module in BaseModule.MODULE_TYPES
 """
 # Global configuration
+
+# Authentication
+# a dictionary of authentication information that can be used by extractors to log in to websites.
+# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
+# Common login 'types' are username/password, cookie, api key/token.
+# Some examples:
+# facebook.com:
+#   username: "my_username"
+#   password: "my_password"
+# or for a site that uses an API key:
+# twitter.com,x.com:
+#   api_key: "my_api_key"
+#   api_secret: "my_api_secret"
+# youtube.com:
+#   cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
+
+authentication: {}
+
 
 # These are the global configurations that are used by the modules
 
 logging:
@@ -136,12 +160,9 @@
 # TODO: make this tidier/find a way to notify of which keys should not be stored
 
-def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
+def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save = deepcopy(config)
-    for key1, key2 in do_not_store_keys:
-        if key1 in config_to_save and key2 in config_to_save[key1]:
-            del config_to_save[key1][key2]
-
+    config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
         yaml.dump(config_to_save, outf)
\ No newline at end of file
diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py
index 9a21b5c..0db5359 100644
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -53,12 +53,4 @@ class ArchivingContext:
         if full_reset:
             ac.keep_on_reset = set()
         ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-    # ---- custom getters/setters for widely used context values
-
-    @staticmethod
-    def set_tmp_dir(tmp_dir: str):
-        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
-
-    @staticmethod
-    def get_tmp_dir() -> str:
-        return ArchivingContext.get_instance().configs.get("tmp_dir")
+    # ---- custom getters/setters for widely used context values
\ No newline at end of file
diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py
index ed261eb..b0d80bc 100644
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -12,7 +12,5 @@ from dataclasses import dataclass
 import mimetypes
 import os
-import mimetypes
-
 import requests
 from loguru import logger
 from retrying import retry
@@ -71,7 +70,7 @@ class Extractor(BaseModule):
         to_filename = url.split('/')[-1].split('?')[0]
         if len(to_filename) > 64:
             to_filename = to_filename[-64:]
-        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
+        to_filename = os.path.join(self.tmp_dir, to_filename)
         if verbose: logger.debug(f"downloading 
{url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 4542b88..501f238 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -7,7 +7,6 @@ from __future__ import annotations from dataclasses import dataclass from typing import List -from abc import ABC import shutil import ast import copy @@ -17,63 +16,12 @@ import os from os.path import join, dirname from loguru import logger import auto_archiver +from .base_module import BaseModule _LAZY_LOADED_MODULES = {} MANIFEST_FILE = "__manifest__.py" -class BaseModule(ABC): - - """ - Base module class. All modules should inherit from this class. - - The exact methods a class implements will depend on the type of module it is, - however all modules have a .setup(config: dict) method to run any setup code - (e.g. logging in to a site, spinning up a browser etc.) - - See BaseModule.MODULE_TYPES for the types of modules you can create, noting that - a subclass can be of multiple types. For example, a module that extracts data from - a website and stores it in a database would be both an 'extractor' and a 'database' module. - - Each module is a python package, and should have a __manifest__.py file in the - same directory as the module file. The __manifest__.py specifies the module information - like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the - default manifest structure. - - """ - - MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' - ] - - _DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. 
This can be left blank to use the default entry point of module_name::ModuleName
-    'version': '1.0', # the version of the module
-    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
-}
-
-    config: dict
-    name: str
-
-    def setup(self, config: dict):
-        self.config = config
-        for key, val in config.get(self.name, {}).items():
-            setattr(self, key, val)
-
-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
 
 def setup_paths(paths: list[str]) -> None:
     """
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index ba46492..ad11849 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -5,12 +5,15 @@
 """
 from __future__ import annotations
 from typing import Generator, Union, List
 from urllib.parse import urlparse
 from ipaddress import ip_address
 import argparse
 import os
 import sys
+import json
+from tempfile import TemporaryDirectory
+import traceback
 
 from rich_argparse import RichHelpFormatter
 
@@ -18,17 +21,46 @@ from .context import ArchivingContext
 from .metadata import Metadata
 from ..version import __version__
-from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
-from . import validators
+from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule
-import tempfile, traceback
 
 from loguru import logger
 
 DEFAULT_CONFIG_FILE = "orchestration.yaml"
 
+class JsonParseAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        try:
+            setattr(namespace, self.dest, json.loads(values))
+        except json.JSONDecodeError as e:
+            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
+
+
+class AuthenticationJsonParseAction(JsonParseAction):
+    def __call__(self, parser, namespace, values, option_string=None):
+        try:
+            super().__call__(parser, namespace, values, option_string)
+            auth_dict = getattr(namespace, self.dest)
+        except argparse.ArgumentTypeError:
+            # not a valid JSON string, treat it as a path to a JSON/YAML file instead
+            auth_dict = values
+
+        if isinstance(auth_dict, str):
+            try:
+                with open(auth_dict, 'r') as f:
+                    try:
+                        auth_dict = json.load(f)
+                    except json.JSONDecodeError:
+                        # maybe it's yaml, try that (rewind first, json.load consumed the file)
+                        f.seek(0)
+                        auth_dict = yaml.load(f)
+            except OSError:
+                pass
+
+        if not isinstance(auth_dict, dict):
+            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        # expand concatenated site keys, e.g. "x.com,twitter.com" into one entry per site
+        for key, val in list(auth_dict.items()):
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site.strip()] = val
+                del auth_dict[key]
+        for site, auth in auth_dict.items():
+            if not isinstance(site, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        setattr(namespace, self.dest, auth_dict)
 
 class UniqueAppendAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         if not hasattr(namespace, self.dest):
@@ -38,9 +70,7 @@ class ArchivingOrchestrator:
             getattr(namespace, self.dest).append(value)
 
 class ArchivingOrchestrator:
-
-    _do_not_store_keys = []
-
+
     def setup_basic_parser(self):
         parser = argparse.ArgumentParser(
                 prog="auto-archiver",
                 epilog="Check the code at https://github.com/bellingcat/auto-archiver",
                 formatter_class=RichHelpFormatter,
         )
-        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit')
+        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
         parser.add_argument('--version', action='version', version=__version__)
-        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
+        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'orchestration.yaml\')', default=DEFAULT_CONFIG_FILE)
         parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
@@ -80,7 +110,6 @@
         # only load the modules enabled in config
         # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
         enabled_modules = []
-
         # first loads the modules from the config file, then from the command line
         for config in [yaml_config['steps'], basic_config.__dict__]:
             for module_type in BaseModule.MODULE_TYPES:
@@ -120,7 +149,7 @@
         if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
             logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
+            store_yaml(self.config, basic_config.config_file)
 
         return self.config
 
@@ -128,18 +157,29 @@
         if not parser:
             parser = self.parser
 
-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
+
+        # allow passing URLs directly on the command line
+        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive; pass one or more URLs directly on the command line (these should not come from config.yaml)')
+
+        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
         parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
         parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
         parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
         parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
         parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
+        parser.add_argument('--authentication', dest='authentication',
+                            help='A dictionary of sites and their authentication methods (token, username etc.) '
+                                 'that extractors can use to log into a website. If passing this on the command line, '
+                                 'use a JSON string. You may also pass a path to a valid JSON/YAML file which will be parsed.',
+                            default={},
+                            action=AuthenticationJsonParseAction)
 
         # logging arguments
         parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
         parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
         parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
+
+
     def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
 
         if not modules:
@@ -147,6 +187,7 @@
         module: LazyBaseModule
         for module in modules:
+
             if not module.configs:
                 # this module has no configs, don't show anything in the help
                 # (TODO: do we want to show something about this module though, like a description?)
@@ -155,12 +196,6 @@
             group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
 
             for name, kwargs in module.configs.items():
-                # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
-                # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
-                do_not_store = kwargs.pop('do_not_store', False)
-                if do_not_store:
-                    self._do_not_store_keys.append((module.name, name))
-
                 if not kwargs.get('metavar', None):
                     # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
                     kwargs['metavar'] = name.upper()
@@ -208,8 +243,7 @@
         step_items = []
         modules_to_load = self.config['steps'][f"{module_type}s"]
 
-        assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \
-                                  in your configuration file or on the command line (using --{module_type}s)"
+        assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
 
         def check_steps_ok():
             if not len(step_items):
@@ -223,12 +257,37 @@
                 exit()
 
         for module in modules_to_load:
+            if module == 'cli_feeder':
+                urls = self.config['urls']
+                if not urls:
+                    logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
+                    self.basic_parser.print_help()
+                    exit()
+
+                # cli_feeder is a pseudo module, it just takes the command line args
+                def feed(self) -> Generator[Metadata]:
+                    for url in urls:
+                        logger.debug(f"Processing URL: '{url}'")
+                        yield Metadata().set_url(url)
+                    ArchivingContext.set("folder", "cli")
+
+                pseudo_module = type('CLIFeeder', (Feeder,), {
+                    'name': 'cli_feeder',
+                    'display_name': 'CLI Feeder',
+                    '__iter__': feed
+                })()
+
+                step_items.append(pseudo_module)
+                continue
+
             if module in invalid_modules:
                 continue
             try:
                 loaded_module: BaseModule = get_module(module, self.config)
             except (KeyboardInterrupt, Exception) as e:
-                logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
-                if module_type == 'extractor' and loaded_module.name == module:
-                    loaded_module.cleanup()
+                logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
+                # NOTE: get_module raised, so there is no loaded_module to clean up here
                 exit()
@@ -285,13 +344,18 @@
     def cleanup(self)->None:
         logger.info("Cleaning up")
-        for e in self.config['steps']['extractors']:
+        for e in self.extractors:
             e.cleanup()
 
     def feed(self) -> Generator[Metadata]:
-        for feeder in self.config['steps']['feeders']:
+
+        url_count = 0
+        for feeder in self.feeders:
             for item in feeder:
                 yield self.feed_item(item)
+                url_count += 1
+
+        logger.success(f"Processed {url_count} URL(s)")
         self.cleanup()
 
     def feed_item(self, item: Metadata) -> Metadata:
         """
        Takes one item (URL) to archive and calls self.archive, additionally:
            - catches keyboard interruptions to do a clean exit
            - catches any unexpected error, logs it, and does a clean exit
        """
+        tmp_dir: TemporaryDirectory = None
         try:
-            ArchivingContext.reset()
-            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                ArchivingContext.set_tmp_dir(tmp_dir)
-                return self.archive(item)
+            tmp_dir = TemporaryDirectory(dir="./")
+            # set tmp_dir on all modules
+            for m in self.all_modules:
+                m.tmp_dir = tmp_dir.name
+            return self.archive(item)
         except KeyboardInterrupt:
             # catches keyboard interruptions to do a clean exit
             logger.warning(f"caught interrupt on {item=}")
-            for d in self.config['steps']['databases']: d.aborted(item)
+            for d in self.databases:
+                d.aborted(item)
             self.cleanup()
             exit()
         except Exception as e:
             logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
-            for d in self.config['steps']['databases']:
-                if type(e) == AssertionError: d.failed(item, str(e))
-                else: d.failed(item, reason="unexpected error")
+            for d in self.databases:
+                if isinstance(e, AssertionError):
+                    d.failed(item, str(e))
+                else:
+                    d.failed(item, reason="unexpected error")
+        finally:
+            if tmp_dir:
+                # remove the tmp_dir from all modules
+                for m in self.all_modules:
+                    m.tmp_dir = None
+                tmp_dir.cleanup()
 
     def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -328,31 +403,38 @@
        5. Store all downloaded/generated media
        6. Call selected Formatter and store formatted if needed
        """
+
         original_url = result.get_url().strip()
 
-        self.assert_valid_url(original_url)
+        try:
+            self.assert_valid_url(original_url)
+        except AssertionError as e:
+            logger.error(f"Error archiving URL {original_url}: {e}")
+            raise
 
         # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
         url = original_url
-        for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
+        for a in self.extractors:
+            url = a.sanitize_url(url)
+
         result.set_url(url)
         if original_url != url: result.set("original_url", original_url)
 
         # 2 - notify start to DBs, propagate already archived if feature enabled in DBs
         cached_result = None
-        for d in self.config["steps"]["databases"]:
+        for d in self.databases:
             d.started(result)
             if (local_result := d.fetch(result)):
                 cached_result = (cached_result or Metadata()).merge(local_result)
         if cached_result:
             logger.debug("Found previously archived entry")
-            for d in self.config["steps"]["databases"]:
+            for d in self.databases:
                 try: d.done(cached_result, cached=True)
                 except Exception as e:
                     logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
             return cached_result
 
         # 3 - call extractors until one succeeds
-        for a in self.config["steps"]["extractors"]:
+        for a in self.extractors:
             logger.info(f"Trying extractor {a.name} for {url}")
             try:
                 result.merge(a.download(result))
@@ -361,7 +443,7 @@
                 logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
 
         # 4 - call enrichers to work with archived content
-        for e in self.config["steps"]["enrichers"]:
+        for e in self.enrichers:
             try: e.enrich(result)
             except Exception as exc:
                 logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
@@ -370,7 +452,7 @@
         result.store()
 
         # 6 - format and store formatted if needed
-        if final_media := self.config["steps"]["formatters"][0].format(result):
+        if final_media := self.formatters[0].format(result):
             final_media.store(url=url, metadata=result)
             result.set_final_media(final_media)
 
@@ -378,7 +460,7 @@
             result.status = "nothing archived"
 
         # signal completion to databases and archivers
-        for d in self.config["steps"]["databases"]:
+        for d in self.databases:
             try: d.done(result)
             except Exception as e:
                 logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@@ -403,4 +485,44 @@
         assert ip.is_global, f"Invalid IP used"
         assert not ip.is_reserved, f"Invalid IP used"
         assert not ip.is_link_local, f"Invalid IP used"
-        assert not ip.is_private, f"Invalid IP used"
\ No newline at end of file
+        assert not ip.is_private, f"Invalid IP used"
+
+
+    # Helper Properties
+
+    @property
+    def feeders(self) -> List[Feeder]:
+        return self._get_property('feeders')
+
+    @property
+    def extractors(self) -> List[Extractor]:
+        return self._get_property('extractors')
+
+    @property
+    def enrichers(self) -> List[Enricher]:
+        return self._get_property('enrichers')
+
+    @property
+    def databases(self) -> List[Database]:
+        return self._get_property('databases')
+
+    @property
+    def storages(self) -> List[Storage]:
+        return self._get_property('storages')
+
+    @property
+    def formatters(self) -> List[Formatter]:
+        return self._get_property('formatters')
+
+    @property
+    def all_modules(self) -> List[BaseModule]:
+        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
+
+    def _get_property(self, prop):
+        try:
+            f = self.config['steps'][prop]
+            if not isinstance(f[0], (BaseModule, LazyBaseModule)):
+                raise TypeError
+            return f
+        except (KeyError, IndexError, TypeError):
+            exit("Property called prior to full initialisation")
\ No newline at end of file
diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py
new file mode 100644
index 0000000..0d05d92
--- /dev/null
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -0,0 +1,40 @@
+from loguru import logger
+import time, os
+from selenium.common.exceptions import TimeoutException
+
+
+from auto_archiver.core import Enricher
+from ..utils import Webdriver, UrlUtil, random_str
+from ..core import Media, Metadata
+
+class ScreenshotEnricher(Enricher):
+    name = "screenshot_enricher"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "width": {"default": 1280, "help": "width of the screenshots"},
+            "height": {"default": 720, "help": "height of the screenshots"},
+            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
+            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        if UrlUtil.is_auth_wall(url):
+            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
+            return
+
+        logger.debug(f"Enriching screenshot for {url=}")
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
+            try:
+                driver.get(url)
+                time.sleep(int(self.sleep_before_screenshot))
+                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
+                driver.save_screenshot(screenshot_file)
+                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
+            except TimeoutException:
+                logger.info("TimeoutException loading page for screenshot")
+            except Exception as e:
+                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py
new file mode 100644
index 0000000..e9da518
--- /dev/null
+++ b/src/auto_archiver/feeders/csv_feeder.py
@@ -0,0 +1,38 @@
+from loguru import logger
+import csv
+
+from auto_archiver.core import Feeder
+from ..core import Metadata, ArchivingContext
+from ..utils import url_or_none
+
+class CSVFeeder(Feeder):
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "files": {
+                "default": None,
+                "help": "Path to the input file(s) to read the URLs from, comma separated. "
+                        "Input files should be formatted with one URL per line",
+                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        }
+
+    def __iter__(self) -> Metadata:
+        url_column = self.column or 0
+        for file in self.files:
+            with open(file, "r") as f:
+                reader = csv.reader(f)
+                first_row = next(reader)
+                if isinstance(url_column, str):
+                    # a column name was given, so the first row must be a header row
+                    url_column = first_row.index(url_column)
+                elif url_or_none(first_row[url_column]):
+                    # the first row is not a header row, process it as a URL too
+                    logger.debug(f"Processing {first_row[url_column]}")
+                    yield Metadata().set_url(first_row[url_column])
+                for row in reader:
+                    url = row[url_column]
+                    logger.debug(f"Processing {url}")
+                    yield Metadata().set_url(url)
+        ArchivingContext.set("folder", "cli")
\ No newline at end of file
diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
index bbf06f6..8c8f9cb 100644
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):
 
             if len(data["results"]) == 0 or cursor is None:
                 break
-
-        logger.success(f"Processed {count} URL(s)")
diff --git a/src/auto_archiver/modules/cli_feeder/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py
deleted file mode 100644
index 9c85787..0000000
--- a/src/auto_archiver/modules/cli_feeder/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .cli_feeder import CLIFeeder
\ No newline at end of file
diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py
deleted file mode 100644
index cf5c1b7..0000000
--- a/src/auto_archiver/modules/cli_feeder/__manifest__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "name": "CLI Feeder",
-    "type": ["feeder"],
-    "requires_setup": False,
-    "dependencies": {
-        "python": ["loguru"],
-    },
-    'entry_point': 'cli_feeder::CLIFeeder',
-    "configs": {
-        "urls": {
-            "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-            "nargs": "+",
-            "required": True,
-            "do_not_store": True,
-            "metavar": "INPUT URLS",
-        },
-    },
-    "description": """
-    Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
-
-    ### Features
-    - Takes a single URL or a list of URLs provided via the command line.
-    - Converts each URL into a `Metadata` object and yields it for processing.
-    - Ensures URLs are processed only if they are explicitly provided.
-
-    """
-}
diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
deleted file mode 100644
index 62cb659..0000000
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from loguru import logger
-
-from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
-
-
-class CLIFeeder(Feeder):
-
-    def __iter__(self) -> Metadata:
-        for url in self.urls:
-            logger.debug(f"Processing URL: '{url}'")
-            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py
index b062ee6..7249395 100644
--- a/src/auto_archiver/modules/csv_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py
@@ -26,7 +26,6 @@
     - Supports reading URLs from multiple input files, specified as a comma-separated list.
     - Allows specifying the column number or name to extract URLs from.
    - Skips header rows if the first value is not a valid URL.
-    - Integrates with the `ArchivingContext` to manage URL feeding.
 
    ### Setup Notes
    - Input files should be formatted with one URL per line.
diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
index ad0a035..1cd9022 100644
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
                 url = row[0]
                 logger.debug(f"Processing {url}")
                 yield Metadata().set_url(url)
-                ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
\ No newline at end of file
+                ArchivingContext.set("folder", "cli")
\ No newline at end of file
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
index e643c21..2879c05 100644
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
 
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, '%(id)s.%(ext)s'),
+                       'quiet': False, 'noplaylist': not self.allow_playlist,
+                       'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles,
+                       "live_from_start": self.live_from_start, "proxy": self.proxy,
+                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
 
         if item.netloc in ['youtube.com', 'www.youtube.com']:
             if self.cookies_from_browser:
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index bfc2efa..4da82c8 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -7,7 +7,7 @@ import json
 import base64
 
 from auto_archiver.version import __version__
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
@@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
             version=__version__
         )
 
-        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
+        html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
         with open(html_path, mode="w", encoding="utf-8") as outf:
             outf.write(content)
         final_media = Media(filename=html_path, _mimetype="text/html")
diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
index be775ce..8e7639a 100644
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException
 
 from auto_archiver.core import Enricher
 from auto_archiver.utils import Webdriver, UrlUtil, random_str
-from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): @@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher): try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") + screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") if self.save_to_pdf: - pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf") pdf = driver.print_page(driver.print_options) with open(pdf_file, "wb") as f: f.write(base64.b64decode(pdf)) diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 52237ee..76784fa 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -23,6 +23,6 @@ class SSLEnricher(Enricher): logger.debug(f"fetching SSL certificate for {domain=} in {url=}") cert = ssl.get_server_certificate((domain, 443)) - cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem") + cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem") with open(cert_fn, "w") as f: f.write(cert) to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 8a08954..3e952e8 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -9,7 +9,7 @@ from tqdm import tqdm import re, time, json, os from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str @@ -120,7 +120,7 @@ class TelethonArchiver(Extractor): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index b27243b..429ba38 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher): logger.debug(f"generating thumbnails for {to_enrich.get_url()}") for m_id, m in enumerate(to_enrich.media[::]): if m.is_video(): - folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24)) + folder = os.path.join(self.tmp_dir, random_str(24)) os.makedirs(folder, exist_ok=True) logger.debug(f"generating thumbnails for {m.filename}") duration = m.get("duration") diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index a7a0aee..078c1ba 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ 
b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -9,9 +9,7 @@ from asn1crypto import pem
 import certifi
 
 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
-from auto_archiver.core import Extractor
-
+from auto_archiver.core import Metadata, Media
 
 class TimestampingEnricher(Enricher):
     """
@@ -33,7 +31,7 @@
             logger.warning(f"No hashes found in {url=}")
             return
 
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         hashes_fn = os.path.join(tmp_dir, "hashes.txt")
 
         data_to_sign = "\n".join(hashes)
@@ -93,7 +91,7 @@
         cert_chain = []
         for cert in path:
-            cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
+            cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
             with open(cert_fn, "wb") as f:
                 f.write(cert.dump())
             cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 301fa89..2d09138 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
 
 from auto_archiver.utils.misc import dump_payload
 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 
 
 class VkExtractor(Extractor):
@@ -35,7 +35,7 @@
 
         result.set_content(dump_payload(vk_scrapes))
 
-        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
+        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
         for filename in filenames:
             result.add_media(Media(filename))
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 8810b84..3f67b7c 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -5,7 +5,7 @@ from zipfile import ZipFile
 from loguru import logger
 from warcio.archiveiterator import ArchiveIterator
 
-from auto_archiver.core import Media, Metadata, ArchivingContext
+from auto_archiver.core import Media, Metadata
 from auto_archiver.core import Extractor, Enricher
 from auto_archiver.utils import UrlUtil, random_str
 
@@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
         url = to_enrich.get_url()
 
         collection = random_str(8)
-        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
         browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
 
         cmd = [
@@ -154,7 +154,7 @@
         logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
 
         # unzipping the .wacz
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         unzipped_dir = os.path.join(tmp_dir, "unzipped")
         with ZipFile(wacz_filename, 'r') as z_obj:
             z_obj.extractall(path=unzipped_dir)
diff --git a/tests/__init__.py b/tests/__init__.py
index 3d66aff..31f38cb 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -2,5 +2,3 @@
-import tempfile
 from auto_archiver.core.context import ArchivingContext
 
-ArchivingContext.reset(full_reset=True)
-ArchivingContext.set_tmp_dir(tempfile.gettempdir())
\ No newline at end of file
+ArchivingContext.reset(full_reset=True)
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index af0fd6d..3bd382b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,7 @@
 pytest conftest file, for shared fixtures and configuration
 """
 
+from tempfile import TemporaryDirectory
 from typing import Dict, Tuple
 import hashlib
 import pytest
@@ -25,8 +26,13 @@ def setup_module(request):
 
         m = get_module(module_name, {module_name: config})
 
+        # add the tmp_dir to the module
+        tmp_dir = TemporaryDirectory()
+        m.tmp_dir = tmp_dir.name
+
         def cleanup():
             _LAZY_LOADED_MODULES.pop(module_name)
+            tmp_dir.cleanup()
 
         request.addfinalizer(cleanup)
         return m
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 03cb521..68417aa 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1,6 +1,6 @@
 import pytest
 import sys
-from argparse import ArgumentParser
+from argparse import ArgumentParser, ArgumentTypeError
 from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
@@ -113,16 +113,23 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
 
     # run the orchestrator
     orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
+    # should run OK, since there are no missing required fields
+    assert orchestrator.config is not None
 
-    # should run OK, since there are no missing required fields
+def test_load_authentication_string(orchestrator, test_args):
+    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
+    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
 
-    # basic_args = basic_parser.parse_known_args(test_args)
-    # test_yaml = read_yaml(TEST_ORCHESTRATION)
-    # test_yaml['example_module'] = {'required_field': 'some_value'}
+def test_load_authentication_string_concat_site(orchestrator, test_args):
+    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
+    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
+                                                     "twitter.com": {"api_key": "my_key"}}
 
-    # # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True
-    # # load the module first
-    # m = get_module_lazy("example_module")
+def test_load_invalid_authentication_string(orchestrator, test_args):
+    with pytest.raises(ArgumentTypeError):
+        orchestrator.run(test_args + ["--authentication", "{invalid_json"])
 
-    # orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[])
-    # assert orchestrator.config is not None
+def test_load_authentication_invalid_dict(orchestrator, test_args):
+    with pytest.raises(ArgumentTypeError):
+        orchestrator.run(test_args + ["--authentication", "[true, false]"])
\ No newline at end of file
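
--
Usage sketch (illustrative only; the example.com URLs and credential values are
placeholders, the flags come from the argparse setup in this patch):

    # URLs can now be passed straight on the command line, no --cli_feeder.urls needed
    auto-archiver https://example.com/some/post
    auto-archiver --config orchestration.yaml https://example.com/some/post

    # authentication can be passed as a JSON string, or a path to a JSON/YAML file
    auto-archiver https://example.com/some/post --authentication '{"x.com,twitter.com": {"api_key": "my_key"}}'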