Merge branch 'main' into timestamping_rewrite

pull/224/head
Patrick Robertson 2025-02-25 17:10:55 +00:00
commit 4dcb77c29f
48 changed files with 889 additions and 299 deletions

View file

@ -23,11 +23,13 @@ Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.
## Installation
View the [Installation Guide](installation/installation.md) for full instructions
View the [Installation Guide](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html) for full instructions
**Advanced:**
To get started quickly using Docker:
`docker pull bellingcat/auto-archiver && docker run`
`docker pull bellingcat/auto-archiver && docker run --rm -v secrets:/app/secrets bellingcat/auto-archiver --config secrets/orchestration.yaml`
Or pip:

View file

@ -3,6 +3,7 @@ from pathlib import Path
from auto_archiver.core.module import ModuleFactory
from auto_archiver.core.base_module import BaseModule
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap
import io
MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_archiver" / "modules"
@ -30,6 +31,7 @@ steps:
...
{config_string}
"""
def generate_module_docs():
@ -38,8 +40,9 @@ def generate_module_docs():
modules_by_type = {}
header_row = "| " + " | ".join(TABLE_HEADER) + "|\n" + "| --- " * len(TABLE_HEADER) + "|\n"
configs_cheatsheet = "\n## Configuration Options\n"
configs_cheatsheet += header_row
global_table = "\n## Configuration Options\n" + header_row
global_yaml = yaml.load("""\n# Module configuration\nplaceholder: {}""")
for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file.
@ -66,19 +69,30 @@ def generate_module_docs():
config_table = header_row
config_yaml = {}
global_yaml[module.name] = CommentedMap()
global_yaml.yaml_set_comment_before_after_key(module.name, f"\n\n{module.display_name} configuration options")
for key, value in manifest['configs'].items():
type = value.get('type', 'string')
if type == 'auto_archiver.utils.json_loader':
if type == 'json_loader':
value['type'] = 'json'
elif type == 'str':
type = "string"
default = value.get('default', '')
config_yaml[key] = default
global_yaml[module.name][key] = default
if value.get('help', ''):
global_yaml[module.name].yaml_add_eol_comment(value.get('help', ''), key)
help = "**Required**. " if value.get('required', False) else "Optional. "
help += value.get('help', '')
config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"
configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
readme_str += "\n## Configuration Options\n"
readme_str += "\n### YAML\n"
@ -103,8 +117,13 @@ def generate_module_docs():
f.write(readme_str)
generate_index(modules_by_type)
del global_yaml['placeholder']
global_string = io.BytesIO()
global_yaml = yaml.dump(global_yaml, global_string)
global_string = global_string.getvalue().decode('utf-8')
global_yaml = f"```yaml\n{global_string}\n```"
with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f:
f.write(configs_cheatsheet)
f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table)
def generate_index(modules_by_type):

BIN
docs/source/bc.png 100644

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

View file

@ -3,9 +3,11 @@
import sys
import os
from importlib.metadata import metadata
from datetime import datetime
sys.path.append(os.path.abspath('../scripts'))
from scripts import generate_module_docs
from auto_archiver.version import __version__
# -- Project Hooks -----------------------------------------------------------
# convert the module __manifest__.py files into markdown files
@ -15,7 +17,8 @@ generate_module_docs()
# -- Project information -----------------------------------------------------
package_metadata = metadata("auto-archiver")
project = package_metadata["name"]
authors = "Bellingcat"
copyright = str(datetime.now().year)
author = "Bellingcat"
release = package_metadata["version"]
language = 'en'
@ -32,7 +35,7 @@ extensions = [
]
templates_path = ['_templates']
exclude_patterns = []
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""]
# -- AutoAPI Configuration ---------------------------------------------------
@ -76,6 +79,14 @@ source_suffix = {
html_theme = 'sphinx_book_theme'
html_static_path = ["../_static"]
html_css_files = ["custom.css"]
html_title = f"Auto Archiver v{__version__}"
html_logo = "bc.png"
html_theme_options = {
"repository_url": "https://github.com/bellingcat/auto-archiver",
"use_repository_button": True,
}
copybutton_prompt_text = r">>> |\.\.\."
copybutton_prompt_is_regexp = True

View file

@ -1,8 +1,8 @@
# Module Documentation
These pages describe the core modules that come with `auto-archiver` and provide the main functionality for archiving websites on the internet. There are five core module types:
These pages describe the core modules that come with Auto Archiver and provide the main functionality for archiving websites on the internet. There are five core module types:
1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing
1. Feeders - these 'feed' information (the URLs) from various sources to the Auto Archiver for processing
2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder
3. Enrichers - these 'enrich' the data extracted in the previous step with additional information
4. Storage - these 'store' the data in a persistent location (on disk, Google Drive etc.)

View file

@ -1,6 +1,6 @@
# Creating Your Own Modules
Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include:
Modules are what's used to extend Auto Archiver to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include:
1. Extracting data from a website which doesn't work with the current core extractors.
2. Enriching or altering the data before saving with additional information that the core enrichers do not offer.
@ -21,7 +21,7 @@ When done, you should have a module structure as follows:
│ └── awesome_extractor.py
```
Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules.
Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the Auto Archiver repository for examples of the folder structure for real-world modules.
## Populating the Manifest File

View file

@ -6,7 +6,7 @@
1. Update the version number in [version.py](src/auto_archiver/version.py)
2. Go to github releases > new release > use `vx.y.z` for matching version notation
1. package is automatically updated in pypi
2. docker image is automatically pushed to dockerhup
2. docker image is automatically pushed to dockerhub

View file

@ -1,49 +1,6 @@
# How-To Guides
## How to use Google Sheets to load and store archive information
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` - see the [Gsheet Feeder Docs](modules/autogen/feeder/gsheet_feeder.md) for more info. The default names of these columns and their purpose is:
Inputs:
* **Link** *(required)*: the URL of the post to archive
* **Destination folder**: custom folder for archived file (regardless of storage)
Outputs:
* **Archive status** *(required)*: Status of archive operation
* **Archive location**: URL of archived post
* **Archive date**: Date archived
* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
* **Timestamp**: Timestamp of original post
* **Title**: Post title
* **Text**: Post text
* **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation.
When the auto archiver starts running, it updates the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
![The archive result for a link in the demo sheet.](../demo-archive.png)
The following pages contain helpful how-to guides for common use cases of the Auto Archiver.
---
```{toctree}
@ -51,4 +8,5 @@ The "archive location" link contains the path of the archived file, in local sto
:glob:
how_to/*
```

View file

@ -0,0 +1,110 @@
# Logging in to sites
This how-to guide shows you how you can use various authentication methods to allow you to login to a site you are trying to archive. This is useful for websites that require a user to be logged in to browse them, or for sites that restrict bots.
In this How-To, we will authenticate on Twitter/X.com using cookies, and on XXXX using username/password.
## Using cookies to authenticate on Twitter/X
It can be useful to archive tweets after logging in, since some tweets are only visible to authenticated users. One case is Tweets marked as 'Sensitive'.
Take this tweet as an example: [https://x.com/SozinhoRamalho/status/1876710769913450647](https://x.com/SozinhoRamalho/status/1876710769913450647)
This tweet has been marked as sensitive, so a normal run of Auto Archiver without a logged in session will fail to extract the tweet:
```{code-block} console
:emphasize-lines: 3,4,5,6
>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱
...
ERROR: [twitter] 1876710769913450647: NSFW tweet requires authentication. Use --cookies,
--cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to
provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
for how to manually pass cookies
[twitter] 1876710769913450647: Downloading guest token
[twitter] 1876710769913450647: Downloading GraphQL JSON
2025-02-20 15:06:13.362 | ERROR | auto_archiver.modules.generic_extractor.generic_extractor:download_for_extractor:248 - Error downloading metadata for post: NSFW tweet requires authentication. Use --cookies, --cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies
[generic] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647
[generic] 1876710769913450647: Downloading webpage
WARNING: [generic] Falling back on generic information extractor
[generic] 1876710769913450647: Extracting information
ERROR: Unsupported URL: https://x.com/SozinhoRamalho/status/1876710769913450647
2025-02-20 15:06:13.744 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor telegram_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647
2025-02-20 15:06:13.744 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='nothing archived', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 6, 12, 473979, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[])
...
```
To get round this limitation, we can use **cookies** (information about a logged in user) to mimic being logged in to Twitter. There are two ways to pass cookies to Auto Archiver. One is from a file, and the other is from a browser profile on your computer.
In this tutorial, we will export the Twitter cookies from our browser and add them to Auto Archiver.
**1. Installing a cookie exporter extension**
First, we need to install an extension in our browser to export the cookies for a certain site. The [FAQ on yt-dlp](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp) provides some suggestions: Get [cookies.txt LOCALLY](https://chromewebstore.google.com/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) for Chrome or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) for Firefox.
**2. Export the cookies**
```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for archiving.
```
Once the extension is installed in your preferred browser, login to Twitter in this browser, and then activate the extension and export the cookies. You can choose to export all your cookies for your browser, or just cookies for this specific site. In the image below, we're only exporting cookies for Twitter/x.com:
![extract cookies](extract_cookies.png)
**3. Adding the cookies file to Auto Archiver**
You now will have a file called `cookies.txt` (tip: name it `twitter_cookies.txt` if you only exported cookies for Twitter), which needs to be added to Auto Archiver.
Do this by going into your Auto Archiver configuration file, and editing the `authentication` section. We will add the `cookies_file` option for the site `x.com,twitter.com`.
```{note} For websites that have multiple URLs (like x.com and twitter.com) you can 'reuse' the same login information without duplicating it using a comma separated list of domain names.
```
I've saved my `twitter_cookies.txt` file in a `secrets` folder, so here's how my authentication section looks now:
```{code} yaml
:caption: orchestration.yaml
...
authentication:
x.com,twitter.com:
cookies_file: secrets/twitter_cookies.txt
...
```
**4. Re-run your archiving with the cookies enabled**
Now, the next time we re-run Auto Archiver, the cookies from our logged-in session will be used, and restricted/sensitive tweets can be downloaded!
```{code} console
>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱ ◼
...
2025-02-20 15:27:46.785 | WARNING | auto_archiver.modules.console_db.console_db:started:13 - STARTED Metadata(status='no archiver', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 46, 785304, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[])
2025-02-20 15:27:46.785 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor generic_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647
[twitter] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647
...
2025-02-20 15:27:53.134 | INFO | auto_archiver.modules.local_storage.local_storage:upload:26 - ./local_archive/https-x-com-sozinhoramalho-status-1876710769913450647/06e8bacf27ac4bb983bf6280.html
2025-02-20 15:27:53.135 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='yt-dlp_Twitter: success',
metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 48, 564738, tzinfo=datetime.timezone.utc), 'url':
'https://x.com/SozinhoRamalho/status/1876710769913450647', 'title': 'ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1',
...
```
### Finishing Touches
You've now successfully exported your cookies from a logged-in session in your browser, and used them to authenticate with Twitter and download a sensitive tweet. Congratulations!
Finally, some important things to remember:
1. It's best not to use your own personal account for archiving. [Here's why](../installation/authentication.md#recommendations-for-authentication).
2. Cookies can be short-lived, so may need updating. Sometimes, a website session may 'expire' or a website may force you to login again. In these instances, you'll need to repeat the export step (step 2) after logging in again to update your cookies.
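As an alternative to re-exporting a cookies file whenever your session expires, Auto Archiver can also read cookies directly from a browser installed on your machine via the `cookies_from_browser` option (see the [authentication docs](../installation/authentication.md)). A minimal sketch, assuming you are logged in to Twitter/X in Firefox:
```{code} yaml
:caption: orchestration.yaml
...
authentication:
  x.com,twitter.com:
    # read the logged-in session straight from your local Firefox profile
    cookies_from_browser: firefox
...
```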
## Authenticating on XXXX site with username/password
```{note} This section is still under construction 🚧
```
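Until that section is written, here is a rough sketch of what username/password authentication looks like, based on the options described in the [authentication docs](../installation/authentication.md) (the domain and credentials below are placeholders):
```{code} yaml
:caption: orchestration.yaml
...
authentication:
  examplesite.com:   # hypothetical site - use the domain you actually log in to
    username: myusername
    password: mypassword
...
```
Remember that username/password details are only used by the Generic Extractor, and some sites (notably Twitter/X and YouTube) block this kind of login, so cookies are usually the more robust option.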

Binary file not shown.

After

Width:  |  Height:  |  Size: 944 KiB

View file

@ -0,0 +1,122 @@
# Using Google Sheets
This guide explains how to set up Google Sheets to process URLs automatically and then store the archiving status back into the Google sheet. It is broadly split into 3 steps:
1. Setting up your Google Sheet
2. Setting up a service account so Auto Archiver can access the sheet
3. Setting the Auto Archiver settings
## 1. Setting up your Google Sheet
Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. Your sheet can have many other columns that the Auto Archiver can use, and you can also include any other columns for your own personal use.
We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project.
Here's an overview of all the columns, and what a complete sheet would look like.
Inputs:
* **Link** *(required)*: the URL of the post to archive
* **Destination folder**: custom folder for archived file (regardless of storage)
Outputs:
* **Archive status** *(required)*: Status of archive operation
* **Archive location**: URL of archived post
* **Archive date**: Date archived
* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
* **Timestamp**: Timestamp of original post
* **Title**: Post title
* **Text**: Post text
* **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
We'll change the name of the 'Destination Folder' column in step 3.
## 2. Setting up your Service Account
Once your Google Sheet is set up, you need to create what's called a 'service account' that will allow the Auto Archiver to access it.
To do this, follow the steps in [this guide](https://gspread.readthedocs.io/en/latest/oauth2.html) all the way up until step 8. You should have downloaded a file called `service_account.json` and shared the Google Sheet with the `client_email` address listed in this file.
Once you've downloaded the file, save it to `secrets/service_account.json`
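If you save the file to a different location, you can tell the `gsheet_feeder` where to find it in your configuration. A minimal sketch, assuming the module exposes a `service_account` path option (check the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md) for the exact option name):
```{code} yaml
:caption: orchestration.yaml
...
gsheet_feeder:
  # path to the service account credentials downloaded in this step
  service_account: secrets/service_account.json
...
```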
## 3. Setting up the configuration file
Now that you've set up your Google sheet, and you've set up the service account so Auto Archiver can access the sheet, the final step is to set your configuration.
First, make sure you have `gsheet_feeder` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also add `gsheet_db` to the `steps.databases` section. Here's how this might look:
```{code} yaml
steps:
feeders:
- gsheet_feeder
...
databases:
- gsheet_db # optional, if you also want to store the results in the Google sheet
...
```
Next, set up the `gsheet_feeder` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up the file, and set the `gsheet_feeder.sheet` setting or the `gsheet_feeder.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'.
Here's how this might look:
```{code} yaml
...
gsheet_feeder:
sheet: 'My Awesome Sheet'
...
```
You can also pass these settings directly on the command line without having to edit the file; here's an example of how to do that (using docker):
`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.sheet "Auto archive test 2023-2"`.
Here, the sheet name has been overridden/specified in the command line invocation.
### 3a. (Optional) Changing the column names
In step 1, we said we would change the name of the 'Destination Folder' column. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `gsheet_feeder.columns` setting in the configuration file. For more information on this setting, see the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'columns' setting, and then edit the 'Destination Folder' entry to rename it 'Save Folder'. Our final configuration section looks like:
```{code} yaml
...
gsheet_feeder:
sheet: 'My Awesome Sheet'
columns:
url: link
status: archive status
folder: save folder # <-- note how this value has been changed
archive: archive location
date: archive date
thumbnail: thumbnail
timestamp: upload timestamp
title: upload title
text: text content
screenshot: screenshot
hash: hash
pdq_hash: perceptual hashes
wacz: wacz
replaywebpage: replaywebpage
```
## Viewing the Results after archiving
With `gsheet_db` enabled, once you start running the Auto Archiver, it will update the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
![The archive result for a link in the demo sheet.](../demo-archive.png)

View file

@ -0,0 +1,71 @@
# Keeping Logs
Auto Archiver's logs can be helpful for debugging problematic archiving processes. This guide shows you how to set up logging and make the most of the logs.
## Setting up logging
Logging settings can be set on the command line or using the orchestration config file ([learn more](../installation/configuration)). A special `logging` section defines the logging options.
#### Enabling or Disabling Logging
Logging to the console is enabled by default. If you want to globally disable Auto Archiver's logging, then you can set `enabled: false` in your `logging` config:
```{code} yaml
...
logging:
enabled: false
...
```
```{note}
This will disable all logs from Auto Archiver, but it does not disable logs for other tools that the Auto Archiver uses (for example: yt-dlp, firefox or ffmpeg). These logs will still appear in your console.
```
#### Logging Level
There are 7 logging levels in total, with 4 commonly used levels. They are: `DEBUG`, `INFO`, `WARNING` and `ERROR`.
Change the logging level by setting the value in your orchestration config file:
```{code} yaml
:caption: orchestration.yaml
...
logging:
level: DEBUG # or INFO / WARNING / ERROR
...
```
For normal usage, it is recommended to use the `INFO` level, or if you prefer quieter logs with less information, you can use the `WARNING` level. If you encounter issues with the archiving, then it's recommended to enable the `DEBUG` level.
```{note} To learn about all logging levels, see the [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html)
```
### Logging to a file
By default, auto-archiver will log to the console. But if you wish to store your logs for future reference, or you are running auto-archiver from within a code implementation, then you may wish to enable file logging. This can be done by setting the `file:` config value in the logging settings.
**Rotation:** For file logging, you can choose to 'rotate' your log files (creating new log files) so they do not get too large. Change this by setting the 'rotation' option in your logging settings. For a full list of rotation options, see the [loguru docs](https://loguru.readthedocs.io/en/stable/overview.html#easier-file-logging-with-rotation-retention-compression).
```{code} yaml
:caption: orchestration.yaml
logging:
...
file: /my/log/file.log
rotation: 1 day
```
### Full logging example
The example below logs only `WARNING` (and higher) messages to the console and to the file `/my/file.log`, rotating that file once per week:
```{code} yaml
:caption: orchestration.yaml
logging:
level: WARNING
file: /my/file.log
rotation: 1 week
```

View file

@ -0,0 +1,127 @@
# Upgrading to v0.13
```{note} This how-to is only relevant for people who used Auto Archiver before February 2025 (versions prior to 0.13).
If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
```
Version 0.13 of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
## How do I know if I need to update my configuration format?
There are two simple ways to check if you need to update your format:
1. When you try and run auto-archiver using your existing configuration file, you get an error about no feeders or formatters being configured, like:
```{code} console
AssertionError: No feeders were configured. Make sure to set at least one feeder in
your configuration file or on the command line (using --feeders)
```
2. Within your configuration file, you have a `feeder:` option. This is the old format. An example of the old format:
```{code} yaml
steps:
feeder: gsheet_feeder
...
```
## Updating your configuration file
To update your configuration file, you can either:
### 1. Manually edit the configuration file and change the values.
This is recommended if you want to keep all your old settings. Follow the steps below to change the relevant settings:
#### a) Feeder & Formatter Steps Settings
The feeder and formatter settings have been changed from a single string to a list.
- `steps.feeder` (string) → `steps.feeders` (list)
- `steps.formatter` (string) → `steps.formatters` (list)
Example:
```{code} yaml
steps:
feeder: cli_feeder
...
formatter: html_formatter
# the above should be changed to:
steps:
feeders:
- cli_feeder
...
formatters:
- html_formatter
```
```{note} Auto Archiver still only supports one feeder and formatter, but from v0.13 onwards they must be added to the configuration file as a list.
```
#### b) Extractor (formerly Archiver) Steps Settings
With v0.13 of Auto Archiver, `archivers` have been renamed to `extractors` to better reflect what they actually do - extract information from a URL. Change the configuration by renaming:
- `steps.archivers` → `steps.extractors`
The names of the actual modules have also changed, so for any extractor modules you have enabled, you will need to rename the `archiver` part to `extractor`. Some examples:
- `telethon_archiver` → `telethon_extractor`
- `wacz_archiver_enricher` → `wacz_extractor_enricher`
- `wayback_archiver_enricher` → `wayback_extractor_enricher`
- `vk_archiver` → `vk_extractor`
Additionally, the `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md).
Example:
```{code} yaml
steps:
...
archivers:
- telethon_archiver
- youtube_archiver
- vk_archiver
# renaming 'archiver' to 'extractor', and renaming the youtube_archiver the above config will become:
steps:
...
extractors:
- telethon_extractor
- vk_extractor
- generic_extractor
```
#### c) Redundant / Obsolete Modules
With v0.13 of Auto Archiver, the following modules have been removed and their features have been built into the generic_extractor. You should remove them from the 'steps' section of your configuration file:
* `twitter_archiver` - use the `generic_extractor` for general extraction, or the `twitter_api_extractor` for API access.
* `tiktok_archiver` - use the `generic_extractor` to extract TikTok videos.
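As an example (a sketch assuming your old configuration only used these modules), the combined effect of removing the obsolete modules and applying the renames from sections (a) and (b) looks like:
```{code} yaml
steps:
  ...
  archivers:
  - twitter_archiver
  - tiktok_archiver
  - youtube_archiver
# after removing the obsolete modules and renaming the rest, the above config becomes:
steps:
  ...
  extractors:
  - generic_extractor
```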
### 2. Auto-generate a new config, then copy over your settings.
Using this method, you can have Auto Archiver auto-generate a configuration file for you, and then copy over the desired settings from your old config file. This is probably the easiest and quickest method to set up, but it may require some trial and error as you copy over your settings.
First, move your existing `orchestration.yaml` file to a different folder or rename it.
Then, you can generate a `simple` or `full` config using:
```{code} console
>>> # generate a simple config
>>> auto-archiver
>>> # config will be written to orchestration.yaml
>>>
>>> # generate a full config
>>> auto-archiver --mode=full
>>>
```
After this, copy over any settings from your old config to the new config.

View file

@ -8,10 +8,10 @@
:caption: Contents:
Overview <self>
contributing
installation/installation.rst
installation/setup
core_modules.md
how_to
contributing
development/developer_guidelines
autoapi/index.rst
```

View file

@ -4,22 +4,42 @@ The Authentication framework for auto-archiver allows you to add login details f
There are two main use cases for authentication:
* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
* Some websites use anti-bot systems to block bot-like tools from accessig the website. Adding real login information to auto-archiver can sometimes bypass this.
* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
## The Authentication Config
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types:
**Username & Password:**
- `username`: str - the username to use for login
- `password`: str - the password to use for login
**API**
- `api_key`: str - the API key to use for login
- `api_secret`: str - the API secret to use for login
**Cookies**
- `cookie`: str - a cookie string to use for login (specific to this site)
- `cookies_from_browser`: str - load cookies from this browser, for this site only.
- `cookies_file`: str - load cookies from this file, for this site only.
```{note}
The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logins.
One of the 'Cookies' options is recommended for the most robust archiving.
```
```{code} yaml
authentication:
# optional file to load authentication information from, for security or multi-system deploy purposes
load_from_file: path/to/authentication/file.txt
# optional setting to load cookies from the named browser on the system.
# optional setting to load cookies from the named browser on the system, for **ALL** websites
cookies_from_browser: firefox
# optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
# optional setting to load cookies from a cookies.txt/cookies.jar file, for **ALL** websites. See note below on extracting these
cookies_file: path/to/cookies.jar
twitter.com,x.com:
mysite.com:
username: myusername
password: 123
@ -29,15 +49,10 @@ authentication:
othersite.com:
api_key: 123
api_secret: 1234
# All available options:
# - username: str - the username to use for login
# - password: str - the password to use for login
# - api_key: str - the API key to use for login
# - api_secret: str - the API secret to use for login
# - cookie: str - a cookie string to use for login (specific to this site)
```
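When using `load_from_file`, the separate file can be JSON or YAML and follows the same structure (optionally wrapped in a top-level `authentication:` key). A minimal sketch of such a file (the filename and values are placeholders):
```{code} yaml
:caption: secrets/authentication.yaml
authentication:
  mysite.com:
    username: myusername
    password: 123
  x.com,twitter.com:
    cookies_file: secrets/twitter_cookies.txt
```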
### Recommendations for authentication
1. **Store authentication information separately:**

View file

@ -1,13 +1,18 @@
# Configuration
This section of the documentation provides guidelines for configuring the tool.
The recommended way to configure auto-archiver for first-time users is to [run the Auto Archiver](setup.md#running) and have it auto-generate a default configuration for you. Then, if needed, you can edit the configuration file using one of the following methods.
## Configuring using a file
The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow.
## 1. Configuration file
The structure of orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for different modules), here's a simplification:
The configuration file is typically called `orchestration.yaml` and stored in the `secrets` folder on your desktop. The configuration file contains all the settings for your entire Auto Archiver workflow in one easy-to-find place.
If you want to have Auto Archiver run with the recommended 'basic' setup, the auto-generated configuration file can be used as-is without any further editing.
### Advanced Configuration
The structure of the orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for individual modules).
A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like:
@ -21,9 +26,9 @@ A default `orchestration.yaml` will be created for you the first time you run au
</details>
## Configuring from the Command Line
## 2. Command Line configuration
You can run auto-archiver directy from the command line, without the need for a configuration file, command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line.
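As an illustration of how command line flags map onto the configuration file, the `--instagram_extractor.api_key=API_KEY` flag above corresponds to the following YAML snippet (a sketch; `API_KEY` is a placeholder), which is roughly what `-s/--store` would write back into your config:
```{code} yaml
:caption: orchestration.yaml
...
instagram_extractor:
  api_key: API_KEY
...
```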

View file

@ -1,80 +1,44 @@
# Installing Auto Archiver
# Installation
```{toctree}
:depth: 1
:hidden:
There are 3 main ways to use the auto-archiver. We recommend the 'docker' method for most uses. This installs all the requirements in one command.
configurations.md
config_cheatsheet.md
```
There are 3 main ways to use the auto-archiver:
1. Easiest: [via docker](#installing-with-docker)
1. Easiest (recommended): [via docker](#installing-with-docker)
2. Local Install: [using pip](#installing-locally-with-pip)
3. Developer Install: [see the developer guidelines](../development/developer_guidelines)
But **you always need a configuration/orchestration file**, which is where you'll configure where/what/how to archive. Make sure you read [orchestration](#orchestration).
## Installing with Docker
## 1. Installing with Docker
[![dockeri.co](https://dockerico.blankenship.io/image/bellingcat/auto-archiver)](https://hub.docker.com/r/bellingcat/auto-archiver)
Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
Docker works like a virtual machine running inside your computer, making installation simple. You'll need to first set up Docker, and then download the Auto Archiver 'image':
1. Install [docker](https://docs.docker.com/get-docker/)
2. Pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
3. Run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
1. `docker run` tells docker to start a new container (an instance of the image)
2. `--rm` makes sure this container is removed after execution (less garbage locally)
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
3. `/app/secrets` points to the path the docker container where this image can be found
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
1. `-v` same as above, this is a volume instruction
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
**a) Download and install docker**
### Example invocations
Go to the [Docker website](https://docs.docker.com/get-docker/) and download right version for your operating system.
The invocations below will run the auto-archiver Docker image using a configuration file that you have specified
**b) Pull the Auto Archiver docker image**
Open your command line terminal, and copy-paste / type:
```bash
# all the configurations come from ./secrets/orchestration.yaml
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
docker pull bellingcat/auto-archiver
```
## Installing Locally with Pip
This will download the docker image, which may take a while.
That's it, all done! You're now ready to set up [your configuration file](configurations.md). Or, if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-docker-install).
------------
## 2. Installing Locally with Pip
1. Make sure you have python 3.10 or higher installed
2. Install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
3. Test it's installed with `auto-archiver --help`
4. Install other local dependency requirements (for )
5. Run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
4. Install other local dependency requirements (for example `ffmpeg`, `firefox`)
### Example invocations
Once all your [local requirements](#installing-local-requirements) are correctly installed, the following example invocations will run auto-archiver:
```bash
# all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
```
After this, you're ready to set up [your configuration file](configurations.md), or if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-local-install).
### Installing Local Requirements

View file

@ -0,0 +1,14 @@
# Requirements
Using the Auto Archiver is very simple, but ideally you have some familiarity with using the command line to run programs. ([Command line crash course](https://developer.mozilla.org/en-US/docs/Learn_web_development/Getting_started/Environment_setup/Command_line)).
### System Requirements
* Auto Archiver works on any Windows, macOS and Linux computer
* If you're using the **local install** method, then you should make sure to have python3.10+ installed
### Storage Requirements
By default, Auto Archiver uses your local computer storage for any downloaded media (videos, images etc.). If you're downloading large files, this may take up a lot of your local computer's space (more than 5GB of space).
If your storage space is limited, then you may want to set up an [alternative storage method](../modules/storage.md) for your media.
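As a sketch of what enabling an alternative storage might look like in your configuration (assuming you use the S3 storage module mentioned in the example invocations elsewhere in these docs; the option value shown is a placeholder):
```{code} yaml
:caption: orchestration.yaml
steps:
  ...
  storages:
  - s3_storage   # upload archived media to S3 instead of filling up local disk

s3_storage:
  private: true  # corresponds to the --s3_storage.private=1 flag used in the example invocations
```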

View file

@ -0,0 +1,76 @@
# Getting Started
```{toctree}
:maxdepth: 1
:hidden:
installation.md
configurations.md
authentication.md
requirements.md
config_cheatsheet.md
```
## Getting Started
To get started with Auto Archiver, there are 3 main steps you need to complete.
1. [Install Auto Archiver](installation.md)
2. [Setup up your configuration](configurations.md) (if you are ok with the default settings, you can skip this step)
3. Run the archiving process<a id="running"></a>
The way you run the Auto Archiver depends on how you installed it (docker install or local install)
### Running a Docker Install
If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:
```bash
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
```
breaking this command down:
1. `docker run` tells docker to start a new container (an instance of the image)
2. `--rm` makes sure this container is removed after execution (less garbage locally)
3. `-v $PWD/secrets:/app/secrets` - your secrets folder with settings
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to); we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
3. `/app/secrets` points to the path inside the docker container where that folder is mounted
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
1. `-v` same as above, this is a volume instruction
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
### Example invocations
The invocations below will run the auto-archiver Docker image using a configuration file that you have specified.
```bash
# Have auto-archiver run with the default settings, generating a settings file in ./secrets/orchestration.yaml
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --mode full
```
------------
### Running a Local Install
### Example invocations
Once all your [local requirements](#installing-local-requirements) are correctly installed, the following example invocations will run auto-archiver:
```bash
# all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
```

View file

@ -8,7 +8,7 @@ The default (enabled) databases are the CSV Database and the Console Database.
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/database/*

View file

@ -7,7 +7,7 @@ Enricher modules are used to add additional information to the items that have
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/enricher/*

View file

@ -4,14 +4,14 @@ Extractor modules are used to extract the content of a given URL. Typically, one
Extractors that are able to extract content from a wide range of websites include:
1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
2. Wayback Machine Extractor: sends pages to the Waygback machine for archiving, and stores the link.
2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link.
3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
```{include} autogen/extractor.md
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/extractor/*

View file

@ -1,8 +1,8 @@
# Feeder Modules
Feeder modules are used to feed URLs into the `auto-archiver` for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line.
Feeder modules are used to feed URLs into the Auto Archiver for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line.
The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into the `auto-archiver` from the command line.
The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into `auto-archiver` from the command line.
Command line feeder usage:
```{code} bash
@ -13,7 +13,7 @@ auto-archiver [options] -- URL1 URL2 ...
```
```{toctree}
:depth: 1
:maxdepth: 1
:glob:
:hidden:
autogen/feeder/*

View file

@ -6,7 +6,7 @@ Formatter modules are used to format the data extracted from a URL into a specif
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/formatter/*

View file

@ -8,7 +8,7 @@ The default is to store the files downloaded (e.g. images, videos) in a local di
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/storage/*

View file

@ -51,7 +51,6 @@ class BaseModule(ABC):
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
config = deepcopy(config)
@ -86,11 +85,13 @@ class BaseModule(ABC):
* api_key: str - the API key to use for login\n
* api_secret: str - the API secret to use for login\n
* cookie: str - a cookie string to use for login (specific to this site)\n
* cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
* cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
site = UrlUtil.domain_for_url(site)
site = UrlUtil.domain_for_url(site).lstrip("www.")
# add the 'www' version of the site to the list of sites to check
authdict = {}
@ -116,17 +117,30 @@ class BaseModule(ABC):
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
get_cookiejar_options = None
# get the cookies jar, prefer the browser cookies than the file
if 'cookies_from_browser' in self.authentication:
# order of priority:
# 1. cookies_from_browser setting in site config
# 2. cookies_file setting in site config
# 3. cookies_from_browser setting in global config
# 4. cookies_file setting in global config
if 'cookies_from_browser' in authdict:
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
elif 'cookies_file' in authdict:
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
elif 'cookies_from_browser' in self.authentication:
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
elif 'cookies_file' in self.authentication:
authdict['cookies_file'] = self.authentication['cookies_file']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
if get_cookiejar_options:
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
return authdict
def repr(self):

View file

@ -7,6 +7,7 @@ flexible setup in various environments.
import argparse
from ruamel.yaml import YAML, CommentedMap, add_representer
import json
from loguru import logger
@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple
_yaml: YAML = YAML()
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
"""
@ -52,6 +55,57 @@ logging:
""")
# note: 'logging' is explicitly added above in order to better format the config file
# Arg Parse Actions/Classes
class AuthenticationJsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
auth_dict = json.loads(values)
setattr(namespace, self.dest, auth_dict)
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
def load_from_file(path):
try:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
class DefaultValidatingParser(argparse.ArgumentParser):
def error(self, message):
@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
return super().parse_known_args(args, namespace)
# Config Utils
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
dotdict = {}
@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
pass
if not config:
config = EMPTY_CONFIG
config = deepcopy(EMPTY_CONFIG)
return config
# TODO: make this tidier/find a way to notify of which keys should not be stored
@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)
def is_valid_config(config: CommentedMap) -> bool:
return config and config != EMPTY_CONFIG
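
For readers following the authentication changes above: `AuthenticationJsonParseAction` expects a dictionary keyed by site name, with a few global options (`cookies_from_browser`, `cookies_file`, `load_from_file`) allowed alongside the per-site entries, and comma-separated site keys are later split apart by `ArchivingOrchestrator.setup_authentication`. Below is a minimal sketch of that shape; all site names and credential values are hypothetical.

```python
# Minimal sketch of the dictionary shape accepted by AuthenticationJsonParseAction,
# e.g. supplied on the command line as --authentication '<JSON string>'.
# All site names and credential values below are hypothetical examples.
example_authentication = {
    # per-site entries: site name -> authentication method
    "example.com": {"username": "my_user", "password": "my_password"},
    # comma-separated keys are split into one entry per site by
    # ArchivingOrchestrator.setup_authentication()
    "site1.example,site2.example": {"cookie": "session=abc123"},
    # global options recognised alongside the per-site entries
    "cookies_from_browser": "firefox",
    "cookies_file": "secrets/cookies.txt",
}
```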

Wyświetl plik

@ -13,7 +13,7 @@ from abc import abstractmethod
from auto_archiver.core import Metadata, BaseModule
class Enricher(BaseModule):
"""Base classes and utilities for enrichers in the Auto-Archiver system.
"""Base classes and utilities for enrichers in the Auto Archiver system.
Enricher modules must implement the `enrich` method to define their behavior.
"""

Wyświetl plik

@ -134,7 +134,6 @@ class LazyBaseModule:
"""
name: str
type: list
description: str
path: str
module_factory: ModuleFactory
@ -148,6 +147,10 @@ class LazyBaseModule:
self.path = path
self.module_factory = factory
@property
def type(self):
return self.manifest['type']
@property
def entry_point(self):
if not self._entry_point and not self.manifest['entry_point']:
@ -183,10 +186,9 @@ class LazyBaseModule:
try:
manifest.update(ast.literal_eval(f.read()))
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
self._manifest = manifest
self.type = manifest['type']
self._entry_point = manifest['entry_point']
self.description = manifest['description']
self.version = manifest['version']
@ -254,7 +256,7 @@ class LazyBaseModule:
instance.module_factory = self.module_factory
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)

Wyświetl plik

@ -6,95 +6,31 @@
from __future__ import annotations
from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse
from ipaddress import ip_address
from copy import copy
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback
from copy import copy
from rich_argparse import RichHelpFormatter
from loguru import logger
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES
from loguru import logger
from auto_archiver.utils.url import check_url_or_raise
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
setattr(namespace, self.dest, json.loads(values))
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
def load_from_file(path):
try:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
# extract out concatenated sites
for key, val in copy(auth_dict).items():
if "," in key:
for site in key.split(","):
auth_dict[site] = val
del auth_dict[key]
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
class SetupError(ValueError):
pass
class ArchivingOrchestrator:
# instance variables
@ -163,7 +99,7 @@ class ArchivingOrchestrator:
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
if is_valid_config(yaml_config):
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
@ -189,7 +125,13 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_individual_module_args(self.module_factory.available_modules(), parser)
all_modules = self.module_factory.available_modules()
# add all the modules to the steps
for module in all_modules:
for module_type in module.type:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
self.add_individual_module_args(all_modules, parser)
parser.set_defaults(**to_dot_notation(yaml_config))
@ -198,6 +140,9 @@ class ArchivingOrchestrator:
# merge the new config with the old one
config = merge_dicts(vars(parsed), yaml_config)
# set up the authentication dict as needed
config = self.setup_authentication(config)
# clean out args from the base_parser that we don't want in the config
for key in vars(basic_config):
config.pop(key, None)
@ -286,14 +231,20 @@ class ArchivingOrchestrator:
self.basic_parser.exit()
def setup_logging(self, config):
logging_config = config['logging']
if logging_config.get('enabled', True) is False:
# disabled logging settings, they're set on a higher level
logger.disable('auto_archiver')
return
# setup loguru logging
try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging']
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
@ -312,27 +263,25 @@ class ArchivingOrchestrator:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
if not modules_to_load:
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
def check_steps_ok():
if not len(step_items):
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
if len(modules_to_load):
logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
exit()
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
exit()
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
for module in modules_to_load:
if module == 'cli_feeder':
# pseudo module, don't load it
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
def feed(self) -> Generator[Metadata]:
for url in urls:
@ -352,13 +301,14 @@ class ArchivingOrchestrator:
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
loaded_module.cleanup()
exit()
raise e
if not loaded_module:
invalid_modules.append(module)
@ -372,7 +322,7 @@ class ArchivingOrchestrator:
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
raise FileNotFoundError(f"Configuration file {config_file} not found")
return read_yaml(config_file)
@ -437,8 +387,12 @@ class ArchivingOrchestrator:
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
"""
self.setup(args)
return self.feed()
try:
self.setup(args)
return self.feed()
except Exception as e:
logger.error(e)
exit(1)
def cleanup(self) -> None:
logger.info("Cleaning up")
@ -503,8 +457,8 @@ class ArchivingOrchestrator:
original_url = result.get_url().strip()
try:
self.assert_valid_url(original_url)
except AssertionError as e:
check_url_or_raise(original_url)
except ValueError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
raise e
@ -564,26 +518,27 @@ class ArchivingOrchestrator:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return result
def assert_valid_url(self, url: str) -> bool:
def setup_authentication(self, config: dict) -> dict:
"""
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
Setup authentication for all modules that require it
Split up strings into multiple sites if they are comma separated
"""
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
parsed = urlparse(url)
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
assert parsed.hostname, f"Invalid URL hostname"
assert parsed.hostname != "localhost", f"Invalid URL"
authentication = config.get('authentication', {})
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError: pass
else:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
site = site.strip()
authentication[site] = val
del authentication[key]
config['authentication'] = authentication
return config
# Helper Properties

Wyświetl plik

@ -1,6 +1,7 @@
# used as validators for config values. Should raise an exception if the value is invalid.
from pathlib import Path
import argparse
import json
def example_validator(value):
if "example" not in value:
@ -16,4 +17,7 @@ def positive_number(value):
def valid_file(value):
if not Path(value).is_file():
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
return value
return value
def json_loader(cli_val):
return json.loads(cli_val)

Wyświetl plik

@ -1,5 +1,5 @@
{
"name": "Auto-Archiver API Database",
"name": "Auto Archiver API Database",
"type": ["database"],
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
@ -39,7 +39,7 @@
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
Provides integration with the Auto Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
@ -49,6 +49,6 @@
- **Optional Storage**: Archives results conditionally based on configuration.
### Setup
Requires access to an Auto-Archiver API instance and a valid API token.
Requires access to an Auto Archiver API instance and a valid API token.
""",
}

Wyświetl plik

@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
ydl_options['cookiefile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

Wyświetl plik

@ -15,7 +15,8 @@
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": True,
},
"columns": {
"default": {
@ -34,16 +35,16 @@
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader",
},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
},
"use_sheet_names_in_stored_paths": {
"default": True,
@ -64,8 +65,10 @@
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
- Supports organizing stored files into folder paths based on sheet and worksheet names.
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
### Setup
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
- Customize the column names in your Google sheet using the `columns` configuration.
""",
}

Wyświetl plik

@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
)
if not self.sheet and not self.sheet_id:
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
def open_sheet(self):
if self.sheet:

Wyświetl plik

@ -18,7 +18,7 @@
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
"type": "auto_archiver.utils.json_loader",
"type": "json_loader",
}
},
"description": """

Wyświetl plik

@ -1 +0,0 @@
from .wacz_enricher import WaczExtractorEnricher

Wyświetl plik

@ -0,0 +1 @@
from .wacz_extractor_enricher import WaczExtractorEnricher

Wyświetl plik

@ -1,7 +1,7 @@
{
"name": "WACZ Enricher",
"name": "WACZ Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": [

Wyświetl plik

@ -1,5 +1,5 @@
{
"name": "Wayback Machine Enricher",
"name": "Wayback Machine Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,

Wyświetl plik

@ -59,10 +59,6 @@ def random_str(length: int = 32) -> str:
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
return json.loads(cli_val)
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
hash = hash_algo()
with open(filename, "rb") as f:

Wyświetl plik

@ -1,5 +1,6 @@
import re
from urllib.parse import urlparse, urlunparse
from ipaddress import ip_address
AUTHWALL_URLS = [
@ -7,6 +8,43 @@ AUTHWALL_URLS = [
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
]
def check_url_or_raise(url: str) -> bool | ValueError:
"""
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
"""
if not (url.startswith("http://") or url.startswith("https://")):
raise ValueError(f"Invalid URL scheme for url {url}")
parsed = urlparse(url)
if not parsed.hostname:
raise ValueError(f"Invalid URL hostname for url {url}")
if parsed.hostname == "localhost":
raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})")
if parsed.scheme not in ["http", "https"]:
raise ValueError(f"Invalid URL scheme, only http and https supported (for url {url})")
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError:
pass
else:
if not ip.is_global:
raise ValueError(f"IP address {ip} is not globally reachable")
if ip.is_reserved:
raise ValueError(f"Reserved IP address {ip} used")
if ip.is_link_local:
raise ValueError(f"Link-local IP address {ip} used")
if ip.is_private:
raise ValueError(f"Private IP address {ip} used")
return True
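
A brief usage sketch of `check_url_or_raise` as imported by the orchestrator above (`from auto_archiver.utils.url import check_url_or_raise`); the URLs below are illustrative only.

```python
from auto_archiver.utils.url import check_url_or_raise

for candidate in ("https://example.com/post/123", "http://127.0.0.1/admin", "ftp://example.com"):
    try:
        check_url_or_raise(candidate)  # returns True for archivable http(s) URLs
        print(f"OK to archive: {candidate}")
    except ValueError as e:
        # loopback/private/reserved IPs and non-http(s) schemes are rejected
        print(f"Refusing to archive {candidate}: {e}")
```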
def domain_for_url(url: str) -> str:
"""
SECURITY: parse the domain using urllib to avoid any potential security issues

Wyświetl plik

@ -18,7 +18,7 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
"socks_proxy_port": None,
"proxy_server": None,
}
wacz = setup_module("wacz_enricher", configs)
wacz = setup_module("wacz_extractor_enricher", configs)
return wacz

Wyświetl plik

@ -68,7 +68,7 @@ class TestGenericExtractor(TestExtractorBase):
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1"
])
def test_download_nonexistend_media(self, make_item, url):
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existent posts/media

Wyświetl plik

@ -9,7 +9,7 @@ from auto_archiver.core import Metadata, Feeder
def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
# Ensure setup() raises ValueError if neither sheet nor sheet_id is set.
mocker.patch("gspread.service_account")
with pytest.raises(AssertionError):
with pytest.raises(ValueError):
setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},

Wyświetl plik

@ -6,7 +6,9 @@ from auto_archiver.__main__ import main
@pytest.fixture
def orchestration_file_path(tmp_path):
return (tmp_path / "example_orch.yaml").as_posix()
folder = tmp_path / "secrets"
folder.mkdir(exist_ok=True)
return (folder / "example_orch.yaml").as_posix()
@pytest.fixture
def orchestration_file(orchestration_file_path):
@ -28,6 +30,7 @@ def autoarchiver(tmp_path, monkeypatch, request):
logger.add(sys.stderr)
request.addfinalizer(cleanup)
(tmp_path / "secrets").mkdir(exist_ok=True)
# change dir to tmp_path
monkeypatch.chdir(tmp_path)
@ -66,6 +69,7 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
# monkey patch to change the current working directory, so that we don't use the user's real config file
monkeypatch.chdir(tmp_path)
(tmp_path / "secrets").mkdir(exist_ok=True)
with monkeypatch.context() as m:
m.setattr(sys, "argv", ["auto-archiver"])
with pytest.raises(SystemExit):

Wyświetl plik

@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core import Metadata
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/"
@ -160,4 +160,26 @@ def test_load_settings_for_module_from_commandline(orchestrator, test_args):
assert len(orchestrator.feeders) == 1
assert orchestrator.feeders[0].name == "gsheet_feeder"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
def test_multiple_orchestrator(test_args):
o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
o1 = ArchivingOrchestrator()
with pytest.raises(ValueError) as exit_error:
# this should fail because the gsheet_feeder requires a sheet_id / sheet
o1.setup(o1_args)
o2_args = test_args + ["--feeders", "example_module"]
o2 = ArchivingOrchestrator()
o2.setup(o2_args)
assert o2.feeders[0].name == "example_module"
output: Metadata = list(o2.feed())
assert len(output) == 1
assert output[0].get_url() == "https://example.com"