Various fixes for issues with new architecture (#208)

* Add formatters to the TOC - fixes #204 * Add 'steps' settings to the example YAML in the docs. Fixes #206 * Improved docs on authentication architecture * Fix setting modules on the command line - they now override any module settings in the orchestration as opposed to appending * Fix tests for gsheet-feeder: add a test service_account.json (note: not real keys in there) * Rename the command line entrypoint to _command_line_run Also: make it clear that code implementation should not call this Make sure the command line entry returns (we don't want a generator) * Fix unit tests to use now code-entry points * Version bump * Move iterating of generator up to __main__ * Breakpoint * two minor fixes * Fix unit tests + add new '__main__' entry point implementation test * Skip youtube tests if running on CI. Should still run them locally * Fix full implementation run on GH actions * Fix skipif test for GH Actions CI * Add skipifs for truth - it blocks GH: --------- Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2025-02-18 19:10:09 +00:00 · 2025-02-18 19:10:09 +00:00 · 3c543a3a6a
commit 3c543a3a6a
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@ -19,6 +19,19 @@ type_color = {

 TABLE_HEADER = ("Option", "Description", "Default", "Type")

+EXAMPLE_YAML = """
+# steps configuration
+steps:
+...
+{steps_str}
+...
+
+# module configuration
+...
+
+{config_string}
+"""
+
 def generate_module_docs():
    yaml = YAML()
    SAVE_FOLDER.mkdir(exist_ok=True)
@ -45,11 +58,14 @@ def generate_module_docs():
 ```
 {description}
 """     
+        steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest['type'])
+
        if not manifest['configs']:
-            readme_str += "\n*This module has no configuration options.*\n"
+            config_string = f"# No configuration options for {module.name}.*\n"
        else:
-            config_yaml = {}
+
            config_table = header_row
+            config_yaml = {}
            for key, value in manifest['configs'].items():
                type = value.get('type', 'string')
                if type == 'auto_archiver.utils.json_loader':
@ -65,11 +81,14 @@ def generate_module_docs():
                configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
            readme_str += "\n## Configuration Options\n"
            readme_str += "\n### YAML\n"
-            yaml_string = io.BytesIO()
-            yaml.dump({module.name: config_yaml}, yaml_string)
-            
-            readme_str += f"```{{code}} yaml\n{yaml_string.getvalue().decode('utf-8')}\n```\n"

+            config_string = io.BytesIO()
+            yaml.dump({module.name: config_yaml}, config_string)
+            config_string = config_string.getvalue().decode('utf-8')
+        yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
+        readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
+
+        if manifest['configs']:
            readme_str += "\n### Command Line:\n"
            readme_str += config_table

@ -103,3 +122,7 @@ def generate_index(modules_by_type):
    with open(SAVE_FOLDER / "module_list.md", "w") as f:
        print("writing", SAVE_FOLDER / "module_list.md")
        f.write(readme_str)
+
+
+if __name__ == "__main__":
+    generate_module_docs()
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -77,3 +77,6 @@ html_theme = 'sphinx_book_theme'
 html_static_path = ["../_static"]
 html_css_files = ["custom.css"]

+copybutton_prompt_text = r">>> |\.\.\."
+copybutton_prompt_is_regexp = True
+copybutton_only_copy_prompt_lines = False
--- a/docs/source/core_modules.md
+++ b/docs/source/core_modules.md
@ -24,4 +24,5 @@ modules/extractor
 modules/enricher
 modules/storage
 modules/database
+modules/formatter
 ```
--- a/docs/source/how_to.md
+++ b/docs/source/how_to.md
@ -45,3 +45,10 @@ The "archive location" link contains the path of the archived file, in local sto
 ![The archive result for a link in the demo sheet.](../demo-archive.png)

 ---
+
+```{toctree}
+:maxdepth: 1
+:glob:
+
+how_to/*
+```
--- a/docs/source/how_to/authentication.md
+++ b/docs/source/how_to/authentication.md
@ -0,0 +1,57 @@
+# Authentication
+
+The Authentication framework for auto-archiver allows you to add login details for various websites in a flexible way, directly from the configuration file.
+
+There are two main use cases for authentication:
+* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
+* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
+
+## The Authentication Config
+
+You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
+
+```{code} yaml
+authentication:
+   # optional file to load authentication information from, for security or multi-system deploy purposes
+   load_from_file: path/to/authentication/file.txt
+   # optional setting to load cookies from the named browser on the system.
+   cookies_from_browser: firefox
+   # optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
+   cookies_file: path/to/cookies.jar
+
+   twitter.com,x.com:
+      username: myusername
+      password: 123
+
+   facebook.com:
+      cookie: single_cookie
+
+   othersite.com:
+      api_key: 123
+      api_secret: 1234
+
+# All available options:
+  # - username: str - the username to use for login
+  # - password: str - the password to use for login
+  # - api_key: str - the API key to use for login
+  # - api_secret: str - the API secret to use for login
+  # - cookie: str - a cookie string to use for login (specific to this site)
+```
+
+### Recommendations for authentication
+
+1. **Store authentication information separately:**
+The authentication part of your configuration contains sensitive information. You should make efforts not to share this with others. For extra security, use the `load_from_file` option to keep your authentication settings out of your configuration file, ideally in a different folder.
+
+2. **Don't use your own personal credentials**
+Depending on the website you are extracting information from, there may be rules (Terms of Service) that prohibit you from scraping or extracting information using a bot. If you use your own personal account, there's a possibility it might get blocked/disabled. It's recommended to set up a separate, 'throwaway' account. In that way, if it gets blocked you can easily create another one to continue your archiving.
+
+
+### How to create a cookies.jar or pass cookies directly to auto-archiver
+
+auto-archiver uses yt-dlp's powerful cookies features under the hood. For instructions on how to extract a cookies.jar (or cookies.txt) file directly from your browser, see the FAQ in the [yt-dlp documentation](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp).
+
+```{note} For developers:
+
+For information on how to access and use authentication settings from within your module, see the `{generic_extractor}` for an example, or view the [`auth_for_site()` function in BaseModule](../autoapi/core/base_module/index.rst)
+```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [project]
 name = "auto-archiver"
-version = "0.13.2"
+version = "0.13.3"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."

 requires-python = ">=3.10,<3.13"
--- a/src/auto_archiver/main.py
+++ b/src/auto_archiver/main.py
@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys

 def main():
-    ArchivingOrchestrator().run(sys.argv[1:])
+    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass

 if __name__ == "__main__":
    main()
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@ -63,12 +63,6 @@ class BaseModule(ABC):
    def config_setup(self, config: dict):

        authentication = config.get('authentication', {})
-        # extract out concatenated sites
-        for key, val in copy(authentication).items():
-            if "," in key:
-                for site in key.split(","):
-                    authentication[site] = val
-                del authentication[key]

        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules
@ -89,16 +83,21 @@ class BaseModule(ABC):
        Returns the authentication information for a given site. This is used to authenticate
        with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
        
-        extract_cookies: bool - whether or not to extract cookies from the given browser and return the 
-        cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
+        :param site: the domain of the site to get authentication information for
+        :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).

-        Currently, the dict can have keys of the following types:
-        - username: str - the username to use for login
-        - password: str - the password to use for login
-        - api_key: str - the API key to use for login
-        - api_secret: str - the API secret to use for login
-        - cookie: str - a cookie string to use for login (specific to this site)
-        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
+        :returns: authdict dict of login information for the given site
+
+        **Global options:**\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox') - uses yt-dlp under the hood to extract them\n
+        * cookies_file: str - the path to a cookies file to use for login\n
+
+        **Currently, the sites dict can have keys of the following types:**\n
+        * username: str - the username to use for login\n
+        * password: str - the password to use for login\n
+        * api_key: str - the API key to use for login\n
+        * api_secret: str - the API secret to use for login\n
+        * cookie: str - a cookie string to use for login (specific to this site)\n
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@ -129,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                yaml_subdict[key] = value
                continue

+            if key == 'steps':
+                for module_type, modules in value.items():
+                    # overwrite the 'steps' from the config file with the ones from the CLI
+                    yaml_subdict[key][module_type] = modules
+
            if is_dict_type(value):
                update_dict(value, yaml_subdict[key])
            elif is_list_type(value):
@ -137,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                yaml_subdict[key] = value

    update_dict(from_dot_notation(dotdict), yaml_dict)
-
    return yaml_dict

 def read_yaml(yaml_filename: str) -> CommentedMap:
@ -159,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
 def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
    config_to_save = deepcopy(config)

+    auth_dict = config_to_save.get("authentication", {})
+    if auth_dict and auth_dict.get('load_from_file'):
+        # remove all other values from the config, don't want to store it in the config file
+        auth_dict = {"load_from_file": auth_dict["load_from_file"]}
+
    config_to_save.pop('urls', None)
    with open(yaml_filename, "w", encoding="utf-8") as outf:
        _yaml.dump(config_to_save, outf)
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@ -8,6 +8,7 @@ from __future__ import annotations
 from typing import Generator, Union, List, Type
 from urllib.parse import urlparse
 from ipaddress import ip_address
+from copy import copy
 import argparse
 import os
 import sys
@ -43,30 +44,50 @@ class AuthenticationJsonParseAction(JsonParseAction):
    def __call__(self, parser, namespace, values, option_string=None):
        super().__call__(parser, namespace, values, option_string)
        auth_dict = getattr(namespace, self.dest)
-        if isinstance(auth_dict, str):
-            # if it's a string
+
+        def load_from_file(path):
            try:
-                with open(auth_dict, 'r') as f:
+                with open(path, 'r') as f:
                    try:
                        auth_dict = json.load(f)
                    except json.JSONDecodeError:
+                        f.seek(0)
                        # maybe it's yaml, try that
                        auth_dict = _yaml.load(f)
+                    if auth_dict.get('authentication'):
+                        auth_dict = auth_dict['authentication']
+                    auth_dict['load_from_file']  = path
+                    return auth_dict
            except:
-                pass
+                return None

+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+        
        if not isinstance(auth_dict, dict):
            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
-        for site, auth in auth_dict.items():
-            if not isinstance(site, str) or not isinstance(auth, dict):
-                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
+        
+        # extract out concatenated sites
+        for key, val in copy(auth_dict).items():
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site] = val
+                del auth_dict[key]
+
        setattr(namespace, self.dest, auth_dict)


 class UniqueAppendAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
-        if not hasattr(namespace, self.dest):
-            setattr(namespace, self.dest, [])
        for value in values:
            if value not in getattr(namespace, self.dest):
                getattr(namespace, self.dest).append(value)
@ -104,36 +125,50 @@ class ArchivingOrchestrator:
        return parser

    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
+
+
+        # modules parser to get the overridden 'steps' values
+        modules_parser = argparse.ArgumentParser(
+            add_help=False,
+        )
+        self.add_modules_args(modules_parser)
+        cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
+        for module_type in BaseModule.MODULE_TYPES:
+            yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
+
        parser = DefaultValidatingParser(
            add_help=False,
        )
        self.add_additional_args(parser)

+        # merge command line module args (--feeders, --enrichers etc.) and add them to the config
+
        # check what mode we're in
        # if we have a config file, use that to decide which modules to load
        # if simple, we'll load just the modules that has requires_setup = False
        # if full, we'll load all modules
        # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
        # but should we add them? Or should we just add them to the 'complete' parser?
+
        if yaml_config != EMPTY_CONFIG:
            # only load the modules enabled in config
            # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
            # first loads the modules from the config file, then from the command line
-            for config in [yaml_config['steps'], basic_config.__dict__]:
-                for module_type in BaseModule.MODULE_TYPES:
-                    enabled_modules.extend(config.get(f"{module_type}s", []))
+            for module_type in BaseModule.MODULE_TYPES:
+                enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))

            # clear out duplicates, but keep the order
            enabled_modules = list(dict.fromkeys(enabled_modules))
            avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
-            self.add_module_args(avail_modules, parser)
+            self.add_individual_module_args(avail_modules, parser)
        elif basic_config.mode == 'simple':
            simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
-            self.add_module_args(simple_modules, parser)
+            self.add_individual_module_args(simple_modules, parser)

            # for simple mode, we use the cli_feeder and any modules that don't require setup
-            yaml_config['steps']['feeders'] = ['cli_feeder']
+            if not yaml_config['steps']['feeders']:
+                yaml_config['steps']['feeders'] = ['cli_feeder']

            # add them to the config
            for module in simple_modules:
@ -141,30 +176,38 @@ class ArchivingOrchestrator:
                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
        else:
            # load all modules, they're not using the 'simple' mode
-            self.add_module_args(available_modules(with_manifest=True), parser)
-
+            self.add_individual_module_args(available_modules(with_manifest=True), parser)
+        
        parser.set_defaults(**to_dot_notation(yaml_config))

        # reload the parser with the new arguments, now that we have them
        parsed, unknown = parser.parse_known_args(unused_args)
-
        # merge the new config with the old one
-        self.config = merge_dicts(vars(parsed), yaml_config)
+        config = merge_dicts(vars(parsed), yaml_config)
+
        # clean out args from the base_parser that we don't want in the config
        for key in vars(basic_config):
-            self.config.pop(key, None)
+            config.pop(key, None)

        # setup the logging
-        self.setup_logging()
+        self.setup_logging(config)

        if unknown:
            logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")

-        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
+        if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
            logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file)
+            store_yaml(config, basic_config.config_file)

-        return self.config
+        return config
+    
+    def add_modules_args(self, parser: argparse.ArgumentParser = None):
+        if not parser:
+            parser = self.parser
+
+        # Module loading from the command line
+        for module_type in BaseModule.MODULE_TYPES:
+            parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)

    def add_additional_args(self, parser: argparse.ArgumentParser = None):
        if not parser:
@ -173,30 +216,24 @@ class ArchivingOrchestrator:
        # allow passing URLs directly on the command line
        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')

-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
-        parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
-        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
-        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
-        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
-        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
-
        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                                                                            (token, username etc.) that extractors can use to log into \
                                                                            a website. If passing this on the command line, use a JSON string. \
                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.',
                                                                            default={},
+                                                                            nargs="?",
                                                                            action=AuthenticationJsonParseAction)
+
        # logging arguments
        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
        parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
        parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)

-    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+    def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:

        if not modules:
            modules = available_modules(with_manifest=True)
-
-        module: LazyBaseModule
+        
        for module in modules:

            if not module.configs:
@ -226,18 +263,19 @@ class ArchivingOrchestrator:
                arg.should_store = should_store

    def show_help(self, basic_config: dict):
-        # for the help message, we want to load *all* possible modules and show the help
+        # for the help message, we want to load manifests from *all* possible modules and show their help/settings
        # add configs as arg parser arguments

+        self.add_modules_args(self.basic_parser)
        self.add_additional_args(self.basic_parser)
-        self.add_module_args(parser=self.basic_parser)
+        self.add_individual_module_args(parser=self.basic_parser)
        self.basic_parser.print_help()
        self.basic_parser.exit()

-    def setup_logging(self):
+    def setup_logging(self, config):
        # setup loguru logging
        logger.remove(0)  # remove the default logger
-        logging_config = self.config['logging']
+        logging_config = config['logging']
        logger.add(sys.stderr, level=logging_config['level'])
        if log_file := logging_config['file']:
            logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
@ -318,9 +356,9 @@ class ArchivingOrchestrator:

        return read_yaml(config_file)
    
-    def setup(self, args: list):
+    def setup_config(self, args: list) -> dict:
        """
-        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        Sets up the configuration file, merging the default config with the user's config
        """
        self.setup_basic_parser()

@ -333,9 +371,16 @@ class ArchivingOrchestrator:
        # if help flag was called, then show the help
        if basic_config.help:
            self.show_help(basic_config)
-
+        # merge command line --feeder etc. args with what's in the yaml config
        yaml_config = self.load_config(basic_config.config_file)
-        self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+    def setup(self, args: list):
+        """
+        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        """
+        self.config = self.setup_config(args)

        logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
        self.install_modules(self.config['steps'])
@ -344,8 +389,18 @@ class ArchivingOrchestrator:
        for module_type in BaseModule.MODULE_TYPES:
            logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))

-    def run(self, args: list) -> Generator[Metadata]:
+    def _command_line_run(self, args: list) -> Generator[Metadata]:
+        """
+        This is the main entry point for the orchestrator, when run from the command line.

+        :param args: list of arguments to pass to the orchestrator - these are the command line args
+        
+        You should not call this method from code implementations.
+          
+        This method sets up the configuration, loads the modules, and runs the feed.
+        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
+        To test configurations without loading any modules, you can also first call 'setup_config'.
+        """
        self.setup(args)
        return self.feed()

--- a/src/auto_archiver/modules/gsheet_feeder/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder/manifest.py
@ -10,7 +10,7 @@
        "sheet": {"default": None, "help": "name of the sheet to archive"},
        "sheet_id": {
            "default": None,
-            "help": "(alternative to sheet name) the id of the sheet to archive",
+            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
@ -200,7 +200,7 @@
                el.innerHTML = decodeCertificate(certificate);

                let cyberChefUrl =
-                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
+                    `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
                // create a new anchor with this url and append after the code
                let a = document.createElement("a");
                a.href = cyberChefUrl;
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@ -4,7 +4,6 @@
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "selenium"],
-        "bin": ["geckodriver"]
    },
    "configs": {
            "width": {"default": 1280, "help": "width of the screenshots"},
--- a/tests/data/test_service_account.json
+++ b/tests/data/test_service_account.json
@ -0,0 +1,14 @@
+{
+    "type": "service_account",
+    "project_id": "some-project-id",
+    "private_key_id": "some-private-key-id",
+    "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDPlcaFJgt7HzoC\n4z0b18PzI2R5c892mLnNwRO8DOKid5INt6z5RAWKDPdnIyHjRBx74qNZl6768pia\nztQNgnud7mKcmvOvGrpUbFx2BdAw8xTyAlRVMalOBhUS9RKvjP5WgSwR5EKwfvzy\nrGioC6ml/segz5EchSaIzgASwB17ir0w6IrymBxUeNelfzCGJpCRhqG5nG+eEjct\nUYU0QIyihRD1Lq0f3Z3D0xfTLLZ630iFBj/Wr0BCJHkl6hdVuGhnyn4S98sMX1Bd\ntaJF/lWi4jdt7SoXD3+FWv66kHPpFfINMpReuB9u0ogfYkORgiRBOMhYBkGGQjUG\nOnBTxEc3AgMBAAECgf9bKiK8DdSz0ALzQbRLhgj2B9485jHI49wjgINOyceZ23uS\nQYXaO+DFLcgLqBkVSGanuHMpU0+qCpeM0v9yXSTIW8RguWMnFd8ID/yLRktxfQa1\n1FAQh+NlF4/gnuUoM8N/FYSy6R5grfaxwU8Qfg66IQXUB52OezSVu5lxNO4G5Rwv\nJ2e/+XYBUv/H26BnQSmjFCzbJkdbtrOeThpaLwLexKcollvoHKGyus0jpWg4C9Ez\n9EJaE+on4nd+cM1Vd+dWaHXoZ9Db9IvxPBqFJE8fynap7RDBeZK678OuCvQntrp4\nrTsE9hW8073Jhl/LbhfbDC0lhFR0JUHygVGE01ECgYEA+g+ddpGGY90yhhM76bTr\nkU6WwislMmfS0WDdLPemNgzLwCtkC2vsQgzg/egxqkVF5dJ9upiFhVgpYxY7ap9U\nSGFemb6T1ASl/1yeNhd0yc4PZFsJ29k+kNgSIlJYm9KDCIMqS1wPoXvFQhbMitOf\n/gLCPugxl67c+qg6nfuODTkCgYEA1IPngESOJnV8oa2WReWrO6+u6xb/OhqdmBzI\n5yq1z3f5gb98XESZR/rCH2vAOmHIJPn3XdZHsznOuxhZwGr1oztiRIurLmBlxQoL\n7tq0jDOUVSD2yeyQwKt5LaBH94P598FiauGxXM4raREWKtcNBGoOX1u1+kEBsoL4\ntf10Z+8CgYEA3QFkB+ECR8y91KW3NAzEjj5JG/8J9wyv1IGpuQ5/hhG1Gni/CSEv\nRAkh6QaIrpZe+ooYuQwIJhwPKBYEGW4MDZSRCYzYFnCtTY5L/j6o55sJG4cipX3R\nwC5XiKIC0mUxjhpvDP+miPBdHNYNnT0AkH1btEF/YzIW+Coq9GnZ2HECgYAOOpax\ne+WYpZ0mphy9qVcBtA2eJ/gGx+ltWeAJuk5aCcpm6Y9GDkHFFAETYX+JaSqhbysk\n2UgLs/8nf8XioEa6GyvFMyTPAh1OSBHseDBGgt2XpZFgi7pVbCW87FJlPCzsbcJN\nLbdWY2d8rWwyihuRBBjaQaW5j8ixTxuf88xreQKBgQCST4Fr8C5CkpakTA+KOost\nLOlziUBm0534mTg7dTcOE1H1+gxtqpXlXcJylpGz1lUXRlHCIutN5iPJcN5cxFES\nsP7wBd7BhficsMKDiWPm9XbP2zXVZu0ldUxA1mONMsS1P4p7i3Dh4uzrRDmSkTUL\njUpppYDumg3oM7wSJ6sTQA==\n-----END PRIVATE KEY-----",
+    "client_email": "some-email",
+    "client_id": "some-client-email",
+    "auth_uri": "https://example.com/o/oauth2/auth",
+    "token_uri": "https://oauth2.example.com/token",
+    "auth_provider_x509_cert_url": "https://www.example.com/oauth2/v1/certs",
+    "client_x509_cert_url": "https://www.example.com/robot/v1/metadata/x509/some-email",
+    "universe_domain": "example.com"
+  }
+  
--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@ -9,6 +9,7 @@ import pytest
 from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
 from .test_extractor_base import TestExtractorBase

+CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
 class TestGenericExtractor(TestExtractorBase):
    """Tests Generic Extractor
    """
@ -77,10 +78,11 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert not result

-
+    @pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
    @pytest.mark.download
    def test_youtube_download(self, make_item):
        # url https://www.youtube.com/watch?v=5qap5aO4i9A
+
        item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
        result = self.extractor.download(item)
        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
@ -114,6 +116,7 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert result is not False
    
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_video(self, make_item):
        item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
@ -121,18 +124,21 @@ class TestGenericExtractor(TestExtractorBase):
        assert len(result.media) == 1
        assert result is not False

+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_no_media(self, make_item):
        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
        result = self.extractor.download(item)
        assert result is not False
    
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_poll(self, make_item):
        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
        result = self.extractor.download(item)
        assert result is not False
    
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_single_image(self, make_item):
        item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
@ -140,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
        assert len(result.media) == 1
        assert result is not False

+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_multiple_images(self, make_item):
        item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
--- a/tests/extractors/test_twitter_api_extractor.py
+++ b/tests/extractors/test_twitter_api_extractor.py
@ -34,7 +34,7 @@ class TestTwitterApiExtractor(TestExtractorBase):

    @pytest.mark.download
    def test_sanitize_url_download(self):
-        assert "https://t.co/yl3oOJatFp" == self.extractor.sanitize_url("https://www.bellingcat.com/category/resources/")
+        # sanitize_url receives the t.co short link and must return the bellingcat.com URL
+        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")

    @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@ -60,3 +60,15 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):

    # should treat an empty file as if there is no file at all
    assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
+
+def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
+    """Call the ``auto_archiver.__main__`` entry point with no command line arguments.
+
+    With ``sys.argv`` reduced to the program name, ``main()`` is expected to raise
+    ``SystemExit`` and the captured log must say that no URLs were provided.
+    """
+    from auto_archiver.__main__ import main
+
+    # monkey patch to change the current working directory, so that we don't use the user's real config file
+    monkeypatch.chdir(tmp_path)
+    with monkeypatch.context() as m:
+        # argv is replaced only inside this context; it holds the program name and nothing else
+        m.setattr(sys, "argv", ["auto-archiver"])
+        with pytest.raises(SystemExit):
+            main()
+
+    assert "No URLs provided. Please provide at least one" in caplog.text
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@ -75,18 +75,36 @@ def test_help(orchestrator, basic_parser, capsys):
        orchestrator.show_help(args)

    assert exit_error.value.code == 0
-    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out
+
+    logs = capsys.readouterr().out
+    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in logs
+
+    # basic config options
+    assert "--version" in logs
+
+    # setting modules options
+    assert "--feeders" in logs
+    assert "--extractors" in logs
+
+    # authentication options
+    assert "--authentication" in logs
+
+    # logging options
+    assert "--logging.level" in logs
+
+    # individual module configs
+    assert "--gsheet_feeder.sheet_id" in logs


 def test_add_custom_modules_path(orchestrator, test_args):
+    """After setup_config, the test modules directory must be listed in ``auto_archiver.modules.__path__``."""
-    orchestrator.run(test_args)
+    orchestrator.setup_config(test_args)
    
    import auto_archiver
    assert "tests/data/test_modules/" in auto_archiver.modules.__path__

 def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):

-    orchestrator.run(test_args +  # we still need to load the real path to get the example_module 
+    orchestrator.setup_config(test_args +  # we still need to load the real path to get the example_module 
                          ["--module_paths", "tests/data/invalid_test_modules/"])

    assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@ -97,7 +115,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
    test_args = test_args[:-2]

    with pytest.raises(SystemExit) as exit_error:
-        orchestrator.run(test_args)
+        config = orchestrator.setup_config(test_args)

    assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"

@ -111,24 +129,50 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
    store_yaml(test_yaml, tmp_file)

    # run the orchestrator
-    orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
-    assert orchestrator.config is not None
+    config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
+    assert config is not None

 def test_load_authentication_string(orchestrator, test_args):
+    """A JSON object passed as the ``--authentication`` string must appear as a dict under ``config['authentication']``."""

-    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
-    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
+    assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}

 def test_load_authentication_string_concat_site(orchestrator, test_args):
+    """A comma-separated site key ("x.com,twitter.com") must be split into one entry per site, each holding the same credentials."""
    
-    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
-    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
+    assert config['authentication'] == {"x.com": {"api_key": "my_key"},
                                                     "twitter.com": {"api_key": "my_key"}}

 def test_load_invalid_authentication_string(orchestrator, test_args):
+    """An ``--authentication`` value that is not valid JSON must raise ``ArgumentTypeError``."""
    with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
+        orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])

 def test_load_authentication_invalid_dict(orchestrator, test_args):
+    """An ``--authentication`` value that is valid JSON but a list rather than an object must raise ``ArgumentTypeError``."""
    with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "[true, false]"])
+        orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
+
+def test_load_modules_from_commandline(orchestrator, test_args):
+    """Select ``example_module`` for every step type on the command line.
+
+    After ``setup``, the orchestrator must hold exactly one feeder, extractor,
+    database, enricher and formatter, each named ``example_module``.
+    """
+    args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
+
+    orchestrator.setup(args)
+
+    # exactly one module per step type
+    assert len(orchestrator.feeders) == 1
+    assert len(orchestrator.extractors) == 1
+    assert len(orchestrator.databases) == 1
+    assert len(orchestrator.enrichers) == 1
+    assert len(orchestrator.formatters) == 1
+
+    # and each of them is the module named on the command line
+    assert orchestrator.feeders[0].name == "example_module"
+    assert orchestrator.extractors[0].name == "example_module"
+    assert orchestrator.databases[0].name == "example_module"
+    assert orchestrator.enrichers[0].name == "example_module"
+    assert orchestrator.formatters[0].name == "example_module"
+
+def test_load_settings_for_module_from_commandline(orchestrator, test_args):
+    """Pass module options as ``--<module>.<option>`` flags and check they reach the orchestrator config.
+
+    After ``setup``, ``gsheet_feeder`` must be the only feeder and its ``sheet_id``
+    must be stored under ``config['gsheet_feeder']`` as the string "123".
+    """
+    # the service account path points at a fixture file under tests/data
+    args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+
+    orchestrator.setup(args)
+
+    assert len(orchestrator.feeders) == 1
+    assert orchestrator.feeders[0].name == "gsheet_feeder"
+    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"