From 3c543a3a6a1d49fdc01337593756006c6515d9aa Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 18 Feb 2025 19:10:09 +0000
Subject: [PATCH] Various fixes for issues with new architecture (#208)

* Add formatters to the TOC - fixes #204
* Add 'steps' settings to the example YAML in the docs. Fixes #206
* Improved docs on authentication architecture
* Fix setting modules on the command line - they now override any module settings in the orchestration as opposed to appending
* Fix tests for gsheet-feeder: add a test service_account.json (note: not real keys in there)
* Rename the command line entrypoint to _command_line_run

  Also: make it clear that code implementations should not call this.
  Make sure the command line entry returns (we don't want a generator)

* Fix unit tests to use new code entry points
* Version bump
* Move iterating of generator up to __main__
* Breakpoint
* Two minor fixes
* Fix unit tests + add new '__main__' entry point implementation test
* Skip youtube tests if running on CI. Should still run them locally
* Fix full implementation run on GH actions
* Fix skipif test for GH Actions CI
* Add skipifs for Truth Social - it blocks GH actions

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
---
 docs/scripts/scripts.py                       |  35 ++++-
 docs/source/conf.py                           |   3 +
 docs/source/core_modules.md                   |   1 +
 docs/source/how_to.md                         |   7 +
 docs/source/how_to/authentication.md          |  57 +++++++
 pyproject.toml                                |   2 +-
 src/auto_archiver/__main__.py                 |   2 +-
 src/auto_archiver/core/base_module.py         |  29 ++--
 src/auto_archiver/core/config.py              |  11 +-
 src/auto_archiver/core/orchestrator.py        | 141 ++++++++++++------
 .../modules/gsheet_feeder/__manifest__.py     |   2 +-
 .../templates/html_template.html              |   2 +-
 .../screenshot_enricher/__manifest__.py       |   1 -
 tests/data/test_service_account.json          |  14 ++
 tests/extractors/test_generic_extractor.py    |   9 +-
 .../extractors/test_twitter_api_extractor.py  |   2 +-
 tests/test_implementation.py                  |  12 ++
 tests/test_orchestrator.py                    |  68 +++++++-
 18 files changed, 314 insertions(+), 84 deletions(-)
 create mode 100644 docs/source/how_to/authentication.md
 create mode 100644 tests/data/test_service_account.json

diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py
index f73315b..9712439 100644
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@@ -19,6 +19,19 @@ type_color = {
 
 TABLE_HEADER = ("Option", "Description", "Default", "Type")
 
+EXAMPLE_YAML = """
+# steps configuration
+steps:
+...
+{steps_str}
+...
+
+# module configuration
+...
+
+{config_string}
+"""
+
 def generate_module_docs():
     yaml = YAML()
     SAVE_FOLDER.mkdir(exist_ok=True)
@@ -45,11 +58,14 @@ def generate_module_docs():
 ```
 {description}
 """
+        steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
+
         if not manifest['configs']:
-            readme_str += "\n*This module has no configuration options.*\n"
+            config_string = f"# No configuration options for {module.name}.*\n"
         else:
-            config_yaml = {}
 
+            config_table = header_row
+            config_yaml = {}
             for key, value in manifest['configs'].items():
                 type = value.get('type', 'string')
                 if type == 'auto_archiver.utils.json_loader':
@@ -65,11 +81,14 @@ def generate_module_docs():
 
                 configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
         readme_str += "\n## Configuration Options\n"
         readme_str += "\n### YAML\n"
-        yaml_string = io.BytesIO()
-        yaml.dump({module.name: config_yaml}, yaml_string)
-
-        readme_str += f"```{{code}} yaml\n{yaml_string.getvalue().decode('utf-8')}\n```\n"
+        config_string = io.BytesIO()
+        yaml.dump({module.name: config_yaml}, config_string)
+        config_string = config_string.getvalue().decode('utf-8')
+        yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
+        readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
+
+
         if manifest['configs']:
             readme_str += "\n### Command Line:\n"
             readme_str += config_table
@@ -103,3 +122,7 @@ def generate_index(modules_by_type):
     with open(SAVE_FOLDER / "module_list.md", "w") as f:
         print("writing", SAVE_FOLDER / "module_list.md")
         f.write(readme_str)
+
+
+if __name__ == "__main__":
+    generate_module_docs()
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e7093c4..5b1ad9b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,3 +77,6 @@ html_theme = 'sphinx_book_theme'
 html_static_path = ["../_static"]
 html_css_files = ["custom.css"]
 
+copybutton_prompt_text = r">>> |\.\.\."
+copybutton_prompt_is_regexp = True
+copybutton_only_copy_prompt_lines = False
\ No newline at end of file
diff --git a/docs/source/core_modules.md b/docs/source/core_modules.md
index 4ee3bfc..3a8e5ec 100644
--- a/docs/source/core_modules.md
+++ b/docs/source/core_modules.md
@@ -24,4 +24,5 @@ modules/extractor
 modules/enricher
 modules/storage
 modules/database
+modules/formatter
 ```
\ No newline at end of file
diff --git a/docs/source/how_to.md b/docs/source/how_to.md
index bf3b9fc..25e1e1d 100644
--- a/docs/source/how_to.md
+++ b/docs/source/how_to.md
@@ -45,3 +45,10 @@ The "archive location" link contains the path of the archived file, in local sto
 ![The archive result for a link in the demo sheet.](../demo-archive.png)
 
 ---
+
+```{toctree}
+:maxdepth: 1
+:glob:
+
+how_to/*
+```
\ No newline at end of file
diff --git a/docs/source/how_to/authentication.md b/docs/source/how_to/authentication.md
new file mode 100644
index 0000000..5f3bc48
--- /dev/null
+++ b/docs/source/how_to/authentication.md
@@ -0,0 +1,57 @@
+# Authentication
+
+The Authentication framework for auto-archiver allows you to add login details for various websites in a flexible way, directly from the configuration file.
+
+There are two main use cases for authentication:
+* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
+* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
+
+## The Authentication Config
+
+You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
+
+```{code} yaml
+authentication:
+   # optional file to load authentication information from, for security or multi-system deploy purposes
+   load_from_file: path/to/authentication/file.txt
+   # optional setting to load cookies from the named browser on the system.
+   cookies_from_browser: firefox
+   # optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
+   cookies_file: path/to/cookies.jar
+
+   twitter.com,x.com:
+      username: myusername
+      password: 123
+
+   facebook.com:
+      cookie: single_cookie
+
+   othersite.com:
+      api_key: 123
+      api_secret: 1234
+
+# All available options:
+   # - username: str - the username to use for login
+   # - password: str - the password to use for login
+   # - api_key: str - the API key to use for login
+   # - api_secret: str - the API secret to use for login
+   # - cookie: str - a cookie string to use for login (specific to this site)
+```
+
+### Recommendations for authentication
+
+1. **Store authentication information separately:**
+The authentication part of your configuration contains sensitive information. You should make efforts not to share this with others. For extra security, use the `load_from_file` option to keep your authentication settings out of your configuration file, ideally in a different folder.
+
+2. **Don't use your own personal credentials**
+Depending on the website you are extracting information from, there may be rules (Terms of Service) that prohibit you from scraping or extracting information using a bot. If you use your own personal account, there's a possibility it might get blocked/disabled. It's recommended to set up a separate, 'throwaway' account. In that way, if it gets blocked you can easily create another one to continue your archiving.
+
+
+### How to create a cookies.jar or pass cookies directly to auto-archiver
+
+auto-archiver uses yt-dlp's powerful cookies features under the hood. For instructions on how to extract a cookies.jar (or cookies.txt) file directly from your browser, see the FAQ in the [yt-dlp documentation](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp)
+
+```{note} For developers:
+
+For information on how to access and use authentication settings from within your module, see the `{generic_extractor}` for an example, or view the [`auth_for_site()` function in BaseModule](../autoapi/core/base_module/index.rst)
+```
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index cd76b59..9823833 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.2"
+version = "0.13.3"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 requires-python = ">=3.10,<3.13"
diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 0023a59..f901d21 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys
 
 def main():
-    ArchivingOrchestrator().run(sys.argv[1:])
+    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
 
 if __name__ == "__main__":
     main()
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index ece4719..dfdd5ad 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -63,12 +63,6 @@ class BaseModule(ABC):
     def config_setup(self, config: dict):
 
         authentication = config.get('authentication', {})
-        # extract out concatenated sites
-        for key, val in copy(authentication).items():
-            if "," in key:
-                for site in key.split(","):
-                    authentication[site] = val
-                del authentication[key]
 
         # this is important. Each instance is given its own deepcopied config, so modules cannot
         # change values to affect other modules
@@ -89,16 +83,21 @@ class BaseModule(ABC):
         Returns the authentication information for a given site. This is used to authenticate
         with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
 
-        extract_cookies: bool - whether or not to extract cookies from the given browser and return the
-        cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
+        :param site: the domain of the site to get authentication information for
+        :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        Currently, the dict can have keys of the following types:
-        - username: str - the username to use for login
-        - password: str - the password to use for login
-        - api_key: str - the API key to use for login
-        - api_secret: str - the API secret to use for login
-        - cookie: str - a cookie string to use for login (specific to this site)
-        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
+        :returns: a dict of login information for the given site
+
+        **Global options:**\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox') - uses yt-dlp under the hood to extract them\n
+        * cookies_file: str - the path to a cookies file to use for login\n
+
+        **Currently, the sites dict can have keys of the following types:**\n
+        * username: str - the username to use for login\n
+        * password: str - the password to use for login\n
+        * api_key: str - the API key to use for login\n
+        * api_secret: str - the API secret to use for login\n
+        * cookie: str - a cookie string to use for login (specific to this site)\n
         """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index c2d38ee..322ef6e 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -129,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                 yaml_subdict[key] = value
                 continue
 
+            if key == 'steps':
+                for module_type, modules in value.items():
+                    # overwrite the 'steps' from the config file with the ones from the CLI
+                    yaml_subdict[key][module_type] = modules
+
             if is_dict_type(value):
                 update_dict(value, yaml_subdict[key])
             elif is_list_type(value):
@@ -137,7 +142,6 @@
             yaml_subdict[key] = value
 
     update_dict(from_dot_notation(dotdict), yaml_dict)
-
     return yaml_dict
 
 def read_yaml(yaml_filename: str) -> CommentedMap:
@@ -159,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
 
 def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save = deepcopy(config)
 
+    auth_dict = config_to_save.get("authentication", {})
+    if auth_dict and auth_dict.get('load_from_file'):
+        # remove all other values from the config, don't want to store it in the config file
+        auth_dict = {"load_from_file": auth_dict["load_from_file"]}
+
     config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
         _yaml.dump(config_to_save, outf)
\ No newline at end of file
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 9dd3e06..208512a 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -8,6 +8,7 @@ from __future__ import annotations
 from typing import Generator, Union, List, Type
 from urllib.parse import urlparse
 from ipaddress import ip_address
+from copy import copy
 import argparse
 import os
 import sys
@@ -43,30 +44,50 @@ class AuthenticationJsonParseAction(JsonParseAction):
     def __call__(self, parser, namespace, values, option_string=None):
         super().__call__(parser, namespace, values, option_string)
         auth_dict = getattr(namespace, self.dest)
-        if isinstance(auth_dict, str):
-            # if it's a string
+
+        def load_from_file(path):
             try:
-                with open(auth_dict, 'r') as f:
+                with open(path, 'r') as f:
                     try:
                         auth_dict = json.load(f)
                     except json.JSONDecodeError:
+                        f.seek(0)
                         # maybe it's yaml, try that
                         auth_dict = _yaml.load(f)
+                if auth_dict.get('authentication'):
+                    auth_dict = auth_dict['authentication']
+                auth_dict['load_from_file'] = path
+                return auth_dict
             except:
-                pass
+                return None
 
+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+
         if not isinstance(auth_dict, dict):
             raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
-        for site, auth in auth_dict.items():
-            if not isinstance(site, str) or not isinstance(auth, dict):
-                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
+
+        # extract out concatenated sites
+        for key, val in copy(auth_dict).items():
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site] = val
+                del auth_dict[key]
+
         setattr(namespace, self.dest, auth_dict)
 
 class UniqueAppendAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        if not hasattr(namespace, self.dest):
-            setattr(namespace, self.dest, [])
         for value in values:
             if value not in getattr(namespace, self.dest):
                 getattr(namespace, self.dest).append(value)
@@ -104,36 +125,50 @@ class ArchivingOrchestrator:
         return parser
 
     def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
+
+
+        # modules parser to get the overridden 'steps' values
+        modules_parser = argparse.ArgumentParser(
+            add_help=False,
+        )
+        self.add_modules_args(modules_parser)
+        cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
+        for module_type in BaseModule.MODULE_TYPES:
+            yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
+
         parser = DefaultValidatingParser(
             add_help=False,
         )
         self.add_additional_args(parser)
+        # merge command line module args (--feeders, --enrichers etc.) and add them to the config
+
         # check what mode we're in
         # if we have a config file, use that to decide which modules to load
         # if simple, we'll load just the modules that has requires_setup = False
         # if full, we'll load all modules
         # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
         # but should we add them? Or should we just add them to the 'complete' parser?
+
         if yaml_config != EMPTY_CONFIG:
             # only load the modules enabled in config
             # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
            # first loads the modules from the config file, then from the command line
-           for config in [yaml_config['steps'], basic_config.__dict__]:
-               for module_type in BaseModule.MODULE_TYPES:
-                   enabled_modules.extend(config.get(f"{module_type}s", []))
+           for module_type in BaseModule.MODULE_TYPES:
+               enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
 
             # clear out duplicates, but keep the order
             enabled_modules = list(dict.fromkeys(enabled_modules))
 
             avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
-            self.add_module_args(avail_modules, parser)
+            self.add_individual_module_args(avail_modules, parser)
 
         elif basic_config.mode == 'simple':
             simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
-            self.add_module_args(simple_modules, parser)
+            self.add_individual_module_args(simple_modules, parser)
 
             # for simple mode, we use the cli_feeder and any modules that don't require setup
-            yaml_config['steps']['feeders'] = ['cli_feeder']
+            if not yaml_config['steps']['feeders']:
+                yaml_config['steps']['feeders'] = ['cli_feeder']
 
             # add them to the config
             for module in simple_modules:
@@ -141,30 +176,38 @@ class ArchivingOrchestrator:
                 yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
         else:
             # load all modules, they're not using the 'simple' mode
-            self.add_module_args(available_modules(with_manifest=True), parser)
-
+            self.add_individual_module_args(available_modules(with_manifest=True), parser)
+
         parser.set_defaults(**to_dot_notation(yaml_config))
 
         # reload the parser with the new arguments, now that we have them
         parsed, unknown = parser.parse_known_args(unused_args)
 
-        # merge the new config with the old one
-        self.config = merge_dicts(vars(parsed), yaml_config)
+        config = merge_dicts(vars(parsed), yaml_config)
+
         # clean out args from the base_parser that we don't want in the config
         for key in vars(basic_config):
-            self.config.pop(key, None)
+            config.pop(key, None)
 
         # setup the logging
-        self.setup_logging()
+        self.setup_logging(config)
 
         if unknown:
             logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
 
-        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
+        if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
             logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file)
+            store_yaml(config, basic_config.config_file)
 
-        return self.config
+        return config
+
+    def add_modules_args(self, parser: argparse.ArgumentParser = None):
+        if not parser:
+            parser = self.parser
+
+        # Module loading from the command line
+        for module_type in BaseModule.MODULE_TYPES:
+            parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
 
     def add_additional_args(self, parser: argparse.ArgumentParser = None):
         if not parser:
@@ -173,30 +216,24 @@ class ArchivingOrchestrator:
         # allow passing URLs directly on the command line
         parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
 
-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
-        parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
-        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
-        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
-        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
-        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
-
         parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                             (token, username etc.) that extractors can use to log into \
                             a website. If passing this on the command line, use a JSON string. \
                             You may also pass a path to a valid JSON/YAML file which will be parsed.',
                             default={},
+                            nargs="?",
                             action=AuthenticationJsonParseAction)
+
         # logging arguments
         parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
         parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
         parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
 
-    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+    def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
 
         if not modules:
             modules = available_modules(with_manifest=True)
-
-        module: LazyBaseModule
+
         for module in modules:
 
             if not module.configs:
@@ -226,18 +263,19 @@ class ArchivingOrchestrator:
             arg.should_store = should_store
 
     def show_help(self, basic_config: dict):
-        # for the help message, we want to load *all* possible modules and show the help
+        # for the help message, we want to load manifests from *all* possible modules and show their help/settings
         # add configs as arg parser arguments
+        self.add_modules_args(self.basic_parser)
         self.add_additional_args(self.basic_parser)
-        self.add_module_args(parser=self.basic_parser)
+        self.add_individual_module_args(parser=self.basic_parser)
 
         self.basic_parser.print_help()
         self.basic_parser.exit()
 
-    def setup_logging(self):
+    def setup_logging(self, config):
         # setup loguru logging
         logger.remove(0) # remove the default logger
-        logging_config = self.config['logging']
+        logging_config = config['logging']
         logger.add(sys.stderr, level=logging_config['level'])
         if log_file := logging_config['file']:
             logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
@@ -318,9 +356,9 @@ class ArchivingOrchestrator:
 
         return read_yaml(config_file)
 
-    def setup(self, args: list):
+    def setup_config(self, args: list) -> dict:
         """
-        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        Sets up the configuration file, merging the default config with the user's config
         """
         self.setup_basic_parser()
@@ -333,9 +371,16 @@ class ArchivingOrchestrator:
         # if help flag was called, then show the help
         if basic_config.help:
             self.show_help(basic_config)
-
+        # merge command line --feeder etc. args with what's in the yaml config
         yaml_config = self.load_config(basic_config.config_file)
-        self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+    def setup(self, args: list):
+        """
+        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        """
+        self.config = self.setup_config(args)
 
         logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
         self.install_modules(self.config['steps'])
@@ -344,8 +389,18 @@ class ArchivingOrchestrator:
         for module_type in BaseModule.MODULE_TYPES:
             logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
 
-    def run(self, args: list) -> Generator[Metadata]:
+    def _command_line_run(self, args: list) -> Generator[Metadata]:
+        """
+        This is the main entry point for the orchestrator when run from the command line.
+        :param args: list of arguments to pass to the orchestrator - these are the command line args
+
+        You should not call this method from code implementations.
+
+        This method sets up the configuration, loads the modules, and runs the feed.
+        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
+        To test configurations without loading any modules, you can also first call 'setup_config'
+        """
         self.setup(args)
         return self.feed()
diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
index 7b74072..77026ea 100644
--- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
@@ -10,7 +10,7 @@
         "sheet": {"default": None, "help": "name of the sheet to archive"},
         "sheet_id": {
             "default": None,
-            "help": "(alternative to sheet name) the id of the sheet to archive",
+            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
         },
         "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
         "service_account": {
diff --git a/src/auto_archiver/modules/html_formatter/templates/html_template.html b/src/auto_archiver/modules/html_formatter/templates/html_template.html
index 8bdf5ef..62d6b0b 100644
--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
@@ -200,7 +200,7 @@
         el.innerHTML = decodeCertificate(certificate);
 
         let cyberChefUrl =
-          `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
+          `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
         // create a new anchor with this url and append after the code
         let a = document.createElement("a");
         a.href = cyberChefUrl;
diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
index 831959e..9829844 100644
--- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
@@ -4,7 +4,6 @@
     "requires_setup": True,
     "dependencies": {
         "python": ["loguru", "selenium"],
-        "bin": ["geckodriver"]
     },
     "configs": {
         "width": {"default": 1280, "help": "width of the screenshots"},
diff --git a/tests/data/test_service_account.json b/tests/data/test_service_account.json
new file mode 100644
index 0000000..5aae894
--- /dev/null
+++ 
b/tests/data/test_service_account.json @@ -0,0 +1,14 @@ +{ + "type": "service_account", + "project_id": "some-project-id", + "private_key_id": "some-private-key-id", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDPlcaFJgt7HzoC\n4z0b18PzI2R5c892mLnNwRO8DOKid5INt6z5RAWKDPdnIyHjRBx74qNZl6768pia\nztQNgnud7mKcmvOvGrpUbFx2BdAw8xTyAlRVMalOBhUS9RKvjP5WgSwR5EKwfvzy\nrGioC6ml/segz5EchSaIzgASwB17ir0w6IrymBxUeNelfzCGJpCRhqG5nG+eEjct\nUYU0QIyihRD1Lq0f3Z3D0xfTLLZ630iFBj/Wr0BCJHkl6hdVuGhnyn4S98sMX1Bd\ntaJF/lWi4jdt7SoXD3+FWv66kHPpFfINMpReuB9u0ogfYkORgiRBOMhYBkGGQjUG\nOnBTxEc3AgMBAAECgf9bKiK8DdSz0ALzQbRLhgj2B9485jHI49wjgINOyceZ23uS\nQYXaO+DFLcgLqBkVSGanuHMpU0+qCpeM0v9yXSTIW8RguWMnFd8ID/yLRktxfQa1\n1FAQh+NlF4/gnuUoM8N/FYSy6R5grfaxwU8Qfg66IQXUB52OezSVu5lxNO4G5Rwv\nJ2e/+XYBUv/H26BnQSmjFCzbJkdbtrOeThpaLwLexKcollvoHKGyus0jpWg4C9Ez\n9EJaE+on4nd+cM1Vd+dWaHXoZ9Db9IvxPBqFJE8fynap7RDBeZK678OuCvQntrp4\nrTsE9hW8073Jhl/LbhfbDC0lhFR0JUHygVGE01ECgYEA+g+ddpGGY90yhhM76bTr\nkU6WwislMmfS0WDdLPemNgzLwCtkC2vsQgzg/egxqkVF5dJ9upiFhVgpYxY7ap9U\nSGFemb6T1ASl/1yeNhd0yc4PZFsJ29k+kNgSIlJYm9KDCIMqS1wPoXvFQhbMitOf\n/gLCPugxl67c+qg6nfuODTkCgYEA1IPngESOJnV8oa2WReWrO6+u6xb/OhqdmBzI\n5yq1z3f5gb98XESZR/rCH2vAOmHIJPn3XdZHsznOuxhZwGr1oztiRIurLmBlxQoL\n7tq0jDOUVSD2yeyQwKt5LaBH94P598FiauGxXM4raREWKtcNBGoOX1u1+kEBsoL4\ntf10Z+8CgYEA3QFkB+ECR8y91KW3NAzEjj5JG/8J9wyv1IGpuQ5/hhG1Gni/CSEv\nRAkh6QaIrpZe+ooYuQwIJhwPKBYEGW4MDZSRCYzYFnCtTY5L/j6o55sJG4cipX3R\nwC5XiKIC0mUxjhpvDP+miPBdHNYNnT0AkH1btEF/YzIW+Coq9GnZ2HECgYAOOpax\ne+WYpZ0mphy9qVcBtA2eJ/gGx+ltWeAJuk5aCcpm6Y9GDkHFFAETYX+JaSqhbysk\n2UgLs/8nf8XioEa6GyvFMyTPAh1OSBHseDBGgt2XpZFgi7pVbCW87FJlPCzsbcJN\nLbdWY2d8rWwyihuRBBjaQaW5j8ixTxuf88xreQKBgQCST4Fr8C5CkpakTA+KOost\nLOlziUBm0534mTg7dTcOE1H1+gxtqpXlXcJylpGz1lUXRlHCIutN5iPJcN5cxFES\nsP7wBd7BhficsMKDiWPm9XbP2zXVZu0ldUxA1mONMsS1P4p7i3Dh4uzrRDmSkTUL\njUpppYDumg3oM7wSJ6sTQA==\n-----END PRIVATE KEY-----", + "client_email": "some-email", + "client_id": "some-client-email", + "auth_uri": "https://example.com/o/oauth2/auth", + "token_uri": "https://oauth2.example.com/token", + "auth_provider_x509_cert_url": "https://www.example.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.example.com/robot/v1/metadata/x509/some-email", + "universe_domain": "example.com" + } + \ No newline at end of file diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index c70a51f..54f4d9c 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -9,6 +9,7 @@ import pytest from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor from .test_extractor_base import TestExtractorBase +CI=os.getenv("GITHUB_ACTIONS", '') == 'true' class TestGenericExtractor(TestExtractorBase): """Tests Generic Extractor """ @@ -77,10 +78,11 @@ class TestGenericExtractor(TestExtractorBase): result = self.extractor.download(item) assert not result - + @pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. 
Youtube (yt-dlp) doesn't support logging in with username/password.")
     @pytest.mark.download
     def test_youtube_download(self, make_item):
         # url https://www.youtube.com/watch?v=5qap5aO4i9A
+
         item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
 
         result = self.extractor.download(item)
         assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
@@ -114,6 +116,7 @@ class TestGenericExtractor(TestExtractorBase):
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_video(self, make_item):
         item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
@@ -121,18 +124,21 @@ class TestGenericExtractor(TestExtractorBase):
         assert len(result.media) == 1
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_no_media(self, make_item):
         item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_poll(self, make_item):
         item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_single_image(self, make_item):
         item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
@@ -140,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
         assert len(result.media) == 1
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_multiple_images(self, make_item):
         item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
diff --git a/tests/extractors/test_twitter_api_extractor.py b/tests/extractors/test_twitter_api_extractor.py
index 004376c..26394ac 100644
--- a/tests/extractors/test_twitter_api_extractor.py
+++ b/tests/extractors/test_twitter_api_extractor.py
@@ -34,7 +34,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
 
     @pytest.mark.download
     def test_sanitize_url_download(self):
-        assert "https://t.co/yl3oOJatFp" == self.extractor.sanitize_url("https://www.bellingcat.com/category/resources/")
+        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
 
     @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
         ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
diff --git a/tests/test_implementation.py b/tests/test_implementation.py
index 7e33651..85fc448 100644
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@@ -60,3 +60,15 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
 
     # should treat an empty file as if there is no file at all
     assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
+
+def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
+    from auto_archiver.__main__ import main
+
+    # monkey patch to change the current working directory, so that we don't use the user's real config file
+    monkeypatch.chdir(tmp_path)
+    with monkeypatch.context() as m:
+        m.setattr(sys, "argv", ["auto-archiver"])
+        with pytest.raises(SystemExit):
+            main()
+
+    assert "No URLs provided. Please provide at least one" in caplog.text
\ No newline at end of file
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 5ba57d0..f93f8b8 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -75,18 +75,36 @@ def test_help(orchestrator, basic_parser, capsys):
         orchestrator.show_help(args)
 
     assert exit_error.value.code == 0
-    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out
+
+    logs = capsys.readouterr().out
+    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in logs
+
+    # basic config options
+    assert "--version" in logs
+
+    # setting modules options
+    assert "--feeders" in logs
+    assert "--extractors" in logs
+
+    # authentication options
+    assert "--authentication" in logs
+
+    # logging options
+    assert "--logging.level" in logs
+
+    # individual module configs
+    assert "--gsheet_feeder.sheet_id" in logs
 
 def test_add_custom_modules_path(orchestrator, test_args):
-    orchestrator.run(test_args)
+    orchestrator.setup_config(test_args)
 
     import auto_archiver
     assert "tests/data/test_modules/" in auto_archiver.modules.__path__
 
 def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
-    orchestrator.run(test_args + # we still need to load the real path to get the example_module
+    orchestrator.setup_config(test_args + # we still need to load the real path to get the example_module
                      ["--module_paths", "tests/data/invalid_test_modules/"])
 
     assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@@ -97,7 +115,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
     test_args = test_args[:-2]
 
     with pytest.raises(SystemExit) as exit_error:
-        orchestrator.run(test_args)
+        config = orchestrator.setup_config(test_args)
 
     assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
 
@@ -111,24 +129,50 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
     store_yaml(test_yaml, tmp_file)
 
     # run the orchestrator
-    orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
-    assert orchestrator.config is not None
+    config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
+    assert config is not None
 
 def test_load_authentication_string(orchestrator, test_args):
-    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
-    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
+    assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
 
 def test_load_authentication_string_concat_site(orchestrator, test_args):
-    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
-    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
+    assert config['authentication'] == {"x.com": {"api_key": "my_key"},
                                                      "twitter.com": {"api_key": "my_key"}}
 
 def test_load_invalid_authentication_string(orchestrator, test_args):
     with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
+        orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
 
 def test_load_authentication_invalid_dict(orchestrator, test_args):
     with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "[true, false]"])
\ No newline at end of file
+        orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
+
+def test_load_modules_from_commandline(orchestrator, test_args):
+    args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
+
+    orchestrator.setup(args)
+
+    assert len(orchestrator.feeders) == 1
+    assert len(orchestrator.extractors) == 1
+    assert len(orchestrator.databases) == 1
+    assert len(orchestrator.enrichers) == 1
+    assert len(orchestrator.formatters) == 1
+
+    assert orchestrator.feeders[0].name == "example_module"
+    assert orchestrator.extractors[0].name == "example_module"
+    assert orchestrator.databases[0].name == "example_module"
+    assert orchestrator.enrichers[0].name == "example_module"
+    assert orchestrator.formatters[0].name == "example_module"
+
+def test_load_settings_for_module_from_commandline(orchestrator, test_args):
+    args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+
+    orchestrator.setup(args)
+
+    assert len(orchestrator.feeders) == 1
+    assert orchestrator.feeders[0].name == "gsheet_feeder"
+    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
\ No newline at end of file
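
For reference, the sketch below shows how the entry points touched by this patch fit together when called from code. It is illustrative only (not part of the diff); "orchestration.yaml" is an assumed example path, and the programmatic path follows the guidance in the new _command_line_run docstring rather than any documented API.

import sys

from auto_archiver.core.orchestrator import ArchivingOrchestrator

# Command-line path (what __main__.py now does): _command_line_run() returns a
# generator of Metadata results, so the caller must drain it.
def run_from_cli() -> None:
    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]):
        pass

# Programmatic path, recommended for code callers by the new docstring:
# set up the configuration and modules first, then iterate the feed.
def run_from_code() -> None:
    orchestrator = ArchivingOrchestrator()
    orchestrator.setup(["--config", "orchestration.yaml"])  # assumed example config path
    for result in orchestrator.feed():
        print(result.get_url())

# Config-only check, without loading any modules:
def check_config_only() -> dict:
    return ArchivingOrchestrator().setup_config(["--config", "orchestration.yaml"])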