Merge branch 'load_modules' into timestamping_rewrite

pull/224/head
Patrick Robertson 2025-02-11 15:21:31 +00:00
commit 7bb4d68a22
122 changed files with 3281 additions and 1011 deletions

108
poetry.lock (generated)

@ -1025,7 +1025,7 @@ version = "0.7.3"
description = "Python logging made (stupidly) simple"
optional = false
python-versions = "<4.0,>=3.5"
groups = ["main"]
groups = ["main", "dev"]
files = [
{file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"},
{file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"},
@ -1750,6 +1750,24 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-loguru"
version = "0.4.0"
description = "Pytest Loguru"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pytest_loguru-0.4.0-py3-none-any.whl", hash = "sha256:3cc7b9c6b22cb158209ccbabf0d678dacd3f3c7497d6f46f1c338c13bee1ac77"},
{file = "pytest_loguru-0.4.0.tar.gz", hash = "sha256:0d9e4e72ae9bfd92f774c666e7353766af11b0b78edd59c290e89be116050f03"},
]
[package.dependencies]
loguru = "*"
[package.extras]
test = ["pytest", "pytest-cov"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -1818,7 +1836,7 @@ version = "6.0.2"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
groups = ["main", "docs"]
groups = ["docs"]
files = [
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@ -2086,6 +2104,82 @@ files = [
[package.dependencies]
pyasn1 = ">=0.1.3"
[[package]]
name = "ruamel-yaml"
version = "0.18.10"
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"},
{file = "ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"},
]
[package.dependencies]
"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""}
[package.extras]
docs = ["mercurial (>5.7)", "ryd"]
jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"]
[[package]]
name = "ruamel-yaml-clib"
version = "0.2.12"
description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml"
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_python_implementation == \"CPython\""
files = [
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
]
[[package]]
name = "s3transfer"
version = "0.11.2"
@ -2956,7 +3050,7 @@ version = "1.2.0"
description = "A small Python utility to set file creation time on Windows"
optional = false
python-versions = ">=3.5"
groups = ["main"]
groups = ["main", "dev"]
markers = "sys_platform == \"win32\""
files = [
{file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"},
@ -2983,14 +3077,14 @@ h11 = ">=0.9.0,<1"
[[package]]
name = "yt-dlp"
version = "2025.1.12"
version = "2025.1.26"
description = "A feature-rich command-line audio/video downloader"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"},
{file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"},
{file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"},
{file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"},
]
[package.extras]
@ -3006,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "d1af74e7fc7c919eda55dd383208edab906508353b4a9eff8e979967484823f8"
content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0"


@ -37,7 +37,6 @@ dependencies = [
"pdqhash (>=0.0.0)",
"pillow (>=0.0.0)",
"python-slugify (>=0.0.0)",
"pyyaml (>=0.0.0)",
"dateparser (>=0.0.0)",
"python-twitter-v2 (>=0.0.0)",
"instaloader (>=0.0.0)",
@ -47,7 +46,7 @@ dependencies = [
"cryptography (>=41.0.0,<42.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (==2025.1.12)",
"yt-dlp (>=2025.1.26,<2026.0.0)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
@ -58,11 +57,13 @@ dependencies = [
"tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)",
"rich-argparse (>=1.6.0,<2.0.0)",
"ruamel-yaml (>=0.18.10,<0.19.0)",
]
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
autopep8 = "^2.3.1"
pytest-loguru = "^0.4.0"
[tool.poetry.group.docs.dependencies]
sphinx = "^8.1.3"


@ -12,7 +12,7 @@ from googleapiclient.errors import HttpError
# Code below from https://developers.google.com/drive/api/quickstart/python
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
SCOPES = ['https://www.googleapis.com/auth/drive']
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
@click.command(
@ -23,7 +23,7 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
"-c",
type=click.Path(exists=True),
help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials",
required=True
required=True,
)
@click.option(
"--token",
@ -31,59 +31,62 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
type=click.Path(exists=False),
default="gd-token.json",
help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json",
required=True
required=True,
)
def main(credentials, token):
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first time.
creds = None
if os.path.exists(token):
with open(token, 'r') as stream:
with open(token, "r") as stream:
creds_json = json.load(stream)
# creds = Credentials.from_authorized_user_file(creds_json, SCOPES)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds_json["refresh_token"] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
print('Requesting new token')
print("Requesting new token")
creds.refresh(Request())
else:
print('First run through so putting up login dialog')
print("First run through so putting up login dialog")
# credentials.json downloaded from https://console.cloud.google.com/apis/credentials
flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES)
creds = flow.run_local_server(port=55192)
# Save the credentials for the next run
with open(token, 'w') as token:
print('Saving new token')
with open(token, "w") as token:
print("Saving new token")
token.write(creds.to_json())
else:
print('Token valid')
print("Token valid")
try:
service = build('drive', 'v3', credentials=creds)
service = build("drive", "v3", credentials=creds)
# About the user
results = service.about().get(fields="*").execute()
emailAddress = results['user']['emailAddress']
emailAddress = results["user"]["emailAddress"]
print(emailAddress)
# Call the Drive v3 API and return some files
results = service.files().list(
pageSize=10, fields="nextPageToken, files(id, name)").execute()
items = results.get('files', [])
results = (
service.files()
.list(pageSize=10, fields="nextPageToken, files(id, name)")
.execute()
)
items = results.get("files", [])
if not items:
print('No files found.')
print("No files found.")
return
print('Files:')
print("Files:")
for item in items:
print(u'{0} ({1})'.format(item['name'], item['id']))
print("{0} ({1})".format(item["name"], item["id"]))
except HttpError as error:
print(f'An error occurred: {error}')
print(f"An error occurred: {error}")
if __name__ == '__main__':
if __name__ == "__main__":
main()


@ -0,0 +1,29 @@
"""
This script is used to create a new session file for the Telegram client.
To do this you must first create a Telegram application at https://my.telegram.org/apps
And store your id and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH.
Create a .env file, or add the following to your environment :
```
export TELEGRAM_API_ID=[YOUR_ID_HERE]
export TELEGRAM_API_HASH=[YOUR_HASH_HERE]
```
Then run this script to create a new session file.
You will need to provide your phone number and a 2FA code the first time you run this script.
"""
import os
from telethon.sync import TelegramClient
from loguru import logger
# Create a new Telegram session using the API credentials
API_ID = os.getenv("TELEGRAM_API_ID")
API_HASH = os.getenv("TELEGRAM_API_HASH")
SESSION_FILE = "secrets/anon-insta"
os.makedirs("secrets", exist_ok=True)
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
logger.success(f"New session file created: {SESSION_FILE}.session")


@ -1,8 +1,9 @@
""" Entry point for the auto_archiver package. """
from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run()
ArchivingOrchestrator().run(sys.argv[1:])
if __name__ == "__main__":
main()


@ -4,7 +4,6 @@
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator


@ -0,0 +1,146 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from loguru import logger
class BaseModule(ABC):
"""
Base module class. All modules should inherit from this class.
The exact methods a class implements will depend on the type of module it is,
however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
@property
def storages(self) -> list:
return self.config.get('storages', [])
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
config = deepcopy(config)
authentication = deepcopy(config.pop('authentication', {}))
self.authentication = authentication
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
def setup(self):
# For any additional setup required by modules, e.g. authentication
pass
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
"""
Returns the authentication information for a given site. This is used to authenticate
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling can speed up processing if you don't actually need the cookie jar)
Currently, the dict can have keys of the following types:
- username: str - the username to use for login
- password: str - the password to use for login
- api_key: str - the API key to use for login
- api_secret: str - the API secret to use for login
- cookie: str - a cookie string to use for login (specific to this site)
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
site = UrlUtil.domain_for_url(site)
# add the 'www' version of the site to the list of sites to check
authdict = {}
for to_try in [site, f"www.{site}"]:
if to_try in self.authentication:
authdict.update(self.authentication[to_try])
break
# do a fuzzy string match just to print a warning - don't use it since it's insecure
if not authdict:
for key in self.authentication.keys():
if key in site or site in key:
logger.debug(f"Could not find exact authentication information for site '{site}'. \
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")
def get_ytdlp_cookiejar(args):
import yt_dlp
from yt_dlp import parse_options
logger.debug(f"Extracting cookies from settings: {args[1]}")
# parse_options returns a named tuple as follows, we only need the ydl_options part
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
# get the cookies jar, prefer the browser cookies than the file
if 'cookies_from_browser' in self.authentication:
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
elif 'cookies_file' in self.authentication:
authdict['cookies_file'] = self.authentication['cookies_file']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
return authdict
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"


@ -11,20 +11,39 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger
from copy import deepcopy
from .module import MODULE_TYPES
from .module import BaseModule
from typing import Any, List, Type, Tuple
yaml = YAML()
_yaml: YAML = YAML()
EMPTY_CONFIG = yaml.load("""
EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
"""
# Global configuration
# Authentication
# a dictionary of authentication information that can be used by extractors to log in to websites.
# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
# Common login 'types' are username/password, cookie, api key/token.
# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
# Some Examples:
# facebook.com:
# username: "my_username"
# password: "my_password"
# or for a site that uses an API key:
# twitter.com,x.com:
# api_key
# api_secret
# youtube.com:
# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
authentication: {}
# These are the global configurations that are used by the modules
logging:
@ -48,6 +67,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
"""
for action in self._actions:
if not namespace or action.dest not in namespace:
# for actions that are required and already have a default value, remove the 'required' check
if action.required and action.default is not None:
action.required = False
if action.default is not None:
try:
self._check_value(action, action.default)
@ -120,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
config = None
try:
with open(yaml_filename, "r", encoding="utf-8") as inf:
config = yaml.load(inf)
config = _yaml.load(inf)
except FileNotFoundError:
pass
@ -132,12 +155,9 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
# TODO: make this tidier/find a way to notify of which keys should not be stored
def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
for key1, key2 in do_not_store_keys:
if key1 in config_to_save and key2 in config_to_save[key1]:
del config_to_save[key1][key2]
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)
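
config.py now keeps a private `_yaml = YAML()` instance; ruamel.yaml's default round-trip mode is what lets `read_yaml`/`store_yaml` re-save a user's config without losing comments or key order. A minimal, self-contained sketch of that round-trip behaviour (the sample YAML is illustrative):

```
from io import StringIO
from ruamel.yaml import YAML

yaml = YAML()  # round-trip mode by default: comments and ordering survive a load/dump cycle

source = """\
# Global configuration
steps:
  feeders: []   # which feeders to run
"""

config = yaml.load(source)      # returns a CommentedMap, not a plain dict
config["steps"]["feeders"] = ["cli_feeder"]

out = StringIO()
yaml.dump(config, out)
print(out.getvalue())           # the '# Global configuration' comment is still present
```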


@ -1,64 +0,0 @@
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
This singleton class allows for:
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
- Marking certain values to persist across resets using `keep_on_reset`.
- Managing temporary directories and other shared data used during the archiving process.
### Key Features:
- Creates a single global instance.
- Reset functionality allows for clearing configurations, with options for partial or full resets.
- Custom getters and setters for commonly used context values like temporary directories.
"""
class ArchivingContext:
"""
Singleton context class for managing global configurations and temporary data.
ArchivingContext._get_instance() to retrieve it if needed
otherwise just
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
reset(full_reset=True) will recreate everything including the keep_on_reset status
"""
_instance = None
def __init__(self):
self.configs = {}
self.keep_on_reset = set()
@staticmethod
def get_instance():
if ArchivingContext._instance is None:
ArchivingContext._instance = ArchivingContext()
return ArchivingContext._instance
@staticmethod
def set(key, value, keep_on_reset: bool = False):
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None):
return ArchivingContext.get_instance().configs.get(key, default)
@staticmethod
def reset(full_reset: bool = False):
ac = ArchivingContext.get_instance()
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
@staticmethod
def set_tmp_dir(tmp_dir: str):
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
@staticmethod
def get_tmp_dir() -> str:
return ArchivingContext.get_instance().configs.get("tmp_dir")


@ -1,12 +1,9 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from abc import abstractmethod
from typing import Union
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Database(BaseModule):
def started(self, item: Metadata) -> None:


@ -9,11 +9,9 @@ the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from abc import abstractmethod
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Enricher(BaseModule):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""


@ -11,20 +11,23 @@ from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes, requests
import mimetypes
import requests
from loguru import logger
from retrying import retry
import re
from ..core import Metadata, ArchivingContext, BaseModule
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Extractor(BaseModule):
"""
Base class for implementing extractors in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
valid_url: re.Pattern = None
def cleanup(self) -> None:
# called when extractors are done, or upon errors, cleanup any resources
pass
@ -32,13 +35,20 @@ class Extractor(BaseModule):
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url
def match_link(self, url: str) -> re.Match:
return self.valid_url.match(url)
def suitable(self, url: str) -> bool:
"""
Returns True if this extractor can handle the given URL
Should be overridden by subclasses
"""
if self.valid_url:
return self.match_link(url) is not None
return True
def _guess_file_type(self, path: str) -> str:
@ -60,7 +70,7 @@ class Extractor(BaseModule):
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@ -85,5 +95,11 @@ class Extractor(BaseModule):
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod
def download(self, item: Metadata) -> Metadata:
def download(self, item: Metadata) -> Metadata | False:
"""
Downloads the media from the given URL and returns a Metadata object with the downloaded media.
If the URL is not supported or the download fails, this method should return False.
"""
pass
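
To make the new `valid_url`/`suitable()` contract concrete, a hedged sketch of a platform extractor; the class name, regex and exact import paths are illustrative assumptions, not a module shipped in this commit:

```
from __future__ import annotations
import re

from auto_archiver.core import Metadata
from auto_archiver.core.extractor import Extractor  # import path assumed for this sketch


class ExampleExtractor(Extractor):
    # suitable(url) in the base class returns True only when this pattern matches
    valid_url: re.Pattern = re.compile(r"https?://(www\.)?example\.com/\S+")

    def download(self, item: Metadata) -> Metadata | False:
        url = item.get_url()
        # download media into self.tmp_dir (assigned by the orchestrator before archiving),
        # attach it to `item`, and return it; return False if nothing could be downloaded
        return item
```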


@ -1,11 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from auto_archiver.core import Metadata
from auto_archiver.core import BaseModule
@dataclass
class Feeder(BaseModule):
@abstractmethod


@ -1,10 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from auto_archiver.core import Metadata, Media, BaseModule
@dataclass
class Formatter(BaseModule):
@abstractmethod


@ -11,8 +11,6 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
from .context import ArchivingContext
from loguru import logger
@ -36,12 +34,11 @@ class Media:
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
# 'Any' typing for metadata to avoid circular imports. Stores the media
# into the provided/available storages [Storage]; repeats the process for
# its properties, in case they have inner media themselves. For now it
# only goes down 1 level, but it's easy to make it recursive if needed.
storages = override_storages or ArchivingContext.get("storages")
if not len(storages):
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
return
@ -66,8 +63,9 @@ class Media:
for inner_media in prop_media.all_inner_media(include_self=True):
yield inner_media
def is_stored(self) -> bool:
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
def is_stored(self, in_storage) -> bool:
# checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value


@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
from loguru import logger
from .media import Media
from .context import ArchivingContext
@dataclass_json # annotation order matters
@dataclass
@ -32,6 +30,7 @@ class Metadata:
def __post_init__(self):
self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
self._context = {}
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@ -45,6 +44,7 @@ class Metadata:
if overwrite_left:
if right.status and len(right.status):
self.status = right.status
self._context.update(right._context)
for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata:
@ -57,12 +57,11 @@ class Metadata:
return right.merge(self)
return self
def store(self: Metadata, override_storages: List = None):
def store(self, storages=[]):
# calls .store for all contained media. storages [Storage]
self.remove_duplicate_media_by_hash()
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages, url=self.get_url(), metadata=self)
media.store(url=self.get_url(), metadata=self, storages=storages)
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val
@ -206,3 +205,10 @@ class Metadata:
if len(r.media) > len(most_complete.media): most_complete = r
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
return most_complete
def set_context(self, key: str, val: Any) -> Metadata:
self._context[key] = val
return self
def get_context(self, key: str, default: Any = None) -> Any:
return self._context.get(key, default)
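
With `ArchivingContext` removed, per-item state now travels on the `Metadata` object itself via `set_context`/`get_context`, and `merge()` carries `_context` across. A short sketch, with illustrative keys and values:

```
from auto_archiver.core import Metadata

item = Metadata().set_url("https://example.com/post/1")
item.set_context("scratch_note", "per-item state")   # stored in item._context, not in item.metadata

# later, e.g. in an enricher or database module
note = item.get_context("scratch_note", default=None)

# merge() also copies _context, so context set in one step survives into the merged result
```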


@ -7,59 +7,70 @@ from __future__ import annotations
from dataclasses import dataclass
from typing import List
from abc import ABC
import shutil
import ast
import copy
import sys
from importlib.util import find_spec
import os
from os.path import join, dirname
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
_LAZY_LOADED_MODULES = {}
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
_DEFAULT_MANIFEST = {
'name': '',
'author': 'Bellingcat',
'type': [],
'requires_setup': True,
'description': '',
'dependencies': {},
'entry_point': '',
'version': '1.0',
'configs': {}
}
class BaseModule(ABC):
config: dict
name: str
def setup_paths(paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
def setup(self, config: dict):
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule:
# sort based on the length of the path, so that the longest path comes first in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return get_module_lazy(module_name).load(config)
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .load() on the lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0]
_LAZY_LOADED_MODULES[module_name] = module
return module
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
@ -67,10 +78,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
default_path = [join(dirname(dirname((__file__))), "modules")]
all_modules = []
for module_folder in default_path + additional_paths:
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
@ -85,8 +95,13 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
all_modules.append(LazyBaseModule(possible_module, possible_module_path))
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
@ -97,8 +112,14 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
@dataclass
class LazyBaseModule:
"""
A lazy module class, which only loads the manifest and does not load the module itself.
This is useful for getting information about a module without actually loading it.
"""
name: str
display_name: str
type: list
description: str
path: str
@ -129,6 +150,10 @@ class LazyBaseModule:
@property
def requires_setup(self) -> bool:
return self.manifest['requires_setup']
@property
def display_name(self) -> str:
return self.manifest['name']
@property
def manifest(self) -> dict:
@ -136,7 +161,7 @@ class LazyBaseModule:
return self._manifest
# print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f:
try:
@ -145,7 +170,6 @@ class LazyBaseModule:
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
self._manifest = manifest
self.display_name = manifest['name']
self.type = manifest['type']
self._entry_point = manifest['entry_point']
self.description = manifest['description']
@ -153,7 +177,7 @@ class LazyBaseModule:
return manifest
def load(self) -> BaseModule:
def load(self, config) -> BaseModule:
if self._instance:
return self._instance
@ -161,11 +185,31 @@ class LazyBaseModule:
# check external dependencies are installed
def check_deps(deps, check):
for dep in deps:
if not len(dep):
# clear out any empty strings that a user may have erroneously added
continue
if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1)
check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep))
def check_python_dep(dep):
# first check if it's a module:
try:
m = get_module_lazy(dep, suppress_warnings=True)
try:
# we must now load this module and set it up with the config
m.load(config)
return True
except:
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
return False
except IndexError:
# not a module, continue
pass
return find_spec(dep)
check_deps(self.dependencies.get('python', []), check_python_dep)
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
@ -184,9 +228,8 @@ class LazyBaseModule:
sub_qualname = f'{qualname}.{file_name}'
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance
instance = getattr(sys.modules[sub_qualname], class_name)()
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
@ -194,6 +237,12 @@ class LazyBaseModule:
instance.display_name = self.display_name
self._instance = instance
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)
instance.setup()
return instance
def __repr__(self):
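
A hedged sketch of how the reworked loading functions fit together: `setup_paths` registers extra module directories, `get_module_lazy` reads only the manifest, and `get_module` imports, instantiates and configures the class. The module name, path and config values are hypothetical:

```
from auto_archiver.core.module import setup_paths, get_module_lazy, get_module

# make any extra module directories importable under auto_archiver.modules
setup_paths(["./my_modules"])                    # hypothetical path

lazy = get_module_lazy("example_module")         # manifest only: cheap, nothing imported yet
print(lazy.display_name, lazy.requires_setup)

config = {"example_module": {"timeout": 30}}     # user values merged over the manifest defaults
instance = get_module("example_module", config)  # imports, instantiates, runs config_setup() and setup()
```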


@ -5,30 +5,61 @@
"""
from __future__ import annotations
from typing import Generator, Union, List
from typing import Generator, Union, List, Type
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback
from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module
from . import validators
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
import tempfile, traceback
from loguru import logger
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
setattr(namespace, self.dest, json.loads(values))
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string, treat it as a path to a JSON/YAML file
try:
with open(auth_dict, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
except:
pass
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
setattr(namespace, self.dest, auth_dict)
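
The two JSON-parsing actions above turn a JSON string (or, for authentication, a path to a JSON/YAML file) into a dict at parse time. A standalone sketch of the underlying argparse pattern, deliberately independent of the project's classes:

```
import argparse
import json


class JsonAction(argparse.Action):
    """Parse a JSON string passed on the command line into a Python object."""
    def __call__(self, parser, namespace, values, option_string=None):
        try:
            setattr(namespace, self.dest, json.loads(values))
        except json.JSONDecodeError as e:
            raise argparse.ArgumentTypeError(f"Invalid JSON for '{self.dest}': {e}")


parser = argparse.ArgumentParser()
parser.add_argument("--authentication", action=JsonAction, default={})
args = parser.parse_args(['--authentication', '{"example.com": {"username": "u", "password": "p"}}'])
print(args.authentication["example.com"]["username"])   # prints: u
```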
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
@ -39,10 +70,16 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator:
_do_not_store_keys = []
feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]]
databases: List[Type[Database]]
storages: List[Type[Storage]]
formatters: List[Type[Formatter]]
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
add_help=False,
description="""
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
@ -51,14 +88,16 @@ class ArchivingOrchestrator:
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
# override the default 'help' so we can inject all the configs and show those
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
self.basic_parser = parser
return parser
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
parser = DefaultValidatingParser(
@ -76,18 +115,22 @@ class ArchivingOrchestrator:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(config.get(f"{module_type}s", []))
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
for module_type in MODULE_TYPES:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser)
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
for module_type in module.type:
@ -115,7 +158,7 @@ class ArchivingOrchestrator:
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
store_yaml(self.config, basic_config.config_file)
return self.config
@ -123,28 +166,37 @@ class ArchivingOrchestrator:
if not parser:
parser = self.parser
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
default={},
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
# additional modules
parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None):
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
module: LazyBaseModule
for module in modules:
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
@ -153,54 +205,54 @@ class ArchivingOrchestrator:
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items():
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
do_not_store = kwargs.pop('do_not_store', False)
if do_not_store:
self._do_not_store_keys.append((module.name, name))
if not kwargs.get('metavar', None):
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
kwargs['metavar'] = name.upper()
if kwargs.get('required', False):
# required args shouldn't have a 'default' value, remove it
kwargs.pop('default', None)
kwargs.pop('cli_set', None)
should_store = kwargs.pop('should_store', False)
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
try:
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
except AttributeError:
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
except KeyError:
kwargs['type'] = getattr(validators, kwargs['type'])
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
arg.should_store = should_store
def show_help(self):
def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help
# add configs as arg parser arguments
self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser)
self.basic_parser.print_help()
exit()
self.basic_parser.exit()
def setup_logging(self):
# setup loguru logging
logger.remove() # remove the default logger
logger.remove(0) # remove the default logger
logging_config = self.config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self):
def install_modules(self, modules_by_type):
"""
Swaps out the previous 'strings' in the config with the actual modules
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
are loaded, the program will exit with an error message.
"""
invalid_modules = []
for module_type in MODULE_TYPES:
for module_type in BaseModule.MODULE_TYPES:
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
modules_to_load = modules_by_type[f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
def check_steps_ok():
if not len(step_items):
@ -214,14 +266,37 @@ class ArchivingOrchestrator:
exit()
for module in modules_to_load:
if module == 'cli_feeder':
# pseudo module, don't load it
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
pseudo_module = type('CLIFeeder', (Feeder,), {
'name': 'cli_feeder',
'display_name': 'CLI Feeder',
'__iter__': feed
})()
pseudo_module.__iter__ = feed
step_items.append(pseudo_module)
continue
if module in invalid_modules:
continue
loaded_module: BaseModule = get_module(module).load()
try:
loaded_module.setup(self.config)
loaded_module: BaseModule = get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
if module_type == 'extractor':
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
loaded_module.cleanup()
exit()
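
The `cli_feeder` branch above builds a one-off Feeder subclass at runtime with `type()`, so URLs passed on the command line behave like any other feeder. A minimal standalone illustration of that dynamic-class pattern (the stand-in `Feeder` class and URLs are illustrative, not the project's code):

```
from typing import Generator


class Feeder:
    """Stand-in for the project's Feeder base class."""


def feed(self) -> Generator[str, None, None]:
    for url in ["https://example.com/a", "https://example.com/b"]:
        yield url


# type(name, bases, namespace) creates a class on the fly; instantiating it gives a pseudo feeder
PseudoFeeder = type("CLIFeeder", (Feeder,), {"name": "cli_feeder", "__iter__": feed})
for url in PseudoFeeder():
    print(url)
```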
@ -230,59 +305,58 @@ class ArchivingOrchestrator:
continue
if loaded_module:
step_items.append(loaded_module)
# TODO temp solution
if module_type == "storage":
ArchivingContext.set("storages", step_items, keep_on_reset=True)
check_steps_ok()
self.config['steps'][f"{module_type}s"] = step_items
setattr(self, f"{module_type}s", step_items)
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
self.config['steps'][f"{module_type}s"] = step_items
return read_yaml(config_file)
def run(self) -> None:
def run(self, args: list) -> None:
self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args)
# load the config file to get the list of enabled items
basic_config, unused_args = self.basic_parser.parse_known_args()
# setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths)
# if help flag was called, then show the help
if basic_config.help:
self.show_help()
self.show_help(basic_config)
# load the config file
yaml_config = {}
if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
yaml_config = read_yaml(basic_config.config_file)
yaml_config = self.load_config(basic_config.config_file)
self.setup_complete_parser(basic_config, yaml_config, unused_args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules()
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
for module_type in BaseModule.MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
for item in self.feed():
for _ in self.feed():
pass
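A hypothetical invocation of the reworked `run()` flow, assuming the package exposes an `auto-archiver` console entry point (the `--config` flag and positional URLs follow the code and the CLI Feeder manifest; the command name itself is an assumption):
```sh
auto-archiver --config orchestration.yaml https://example.com/some-post
```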
def cleanup(self)->None:
logger.info("Cleaning up")
for e in self.config['steps']['extractors']:
for e in self.extractors:
e.cleanup()
def feed(self) -> Generator[Metadata]:
for feeder in self.config['steps']['feeders']:
url_count = 0
for feeder in self.feeders:
for item in feeder:
yield self.feed_item(item)
url_count += 1
logger.success(f"Processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:
@ -291,22 +365,33 @@ class ArchivingOrchestrator:
- catches keyboard interruptions to do a clean exit
- catches any unexpected error, logs it, and does a clean exit
"""
tmp_dir: TemporaryDirectory = None
try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
tmp_dir = TemporaryDirectory(dir="./")
# set tmp_dir on all modules
for m in self.all_modules:
m.tmp_dir = tmp_dir.name
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.config['steps']['databases']: d.aborted(item)
for d in self.databases:
d.aborted(item)
self.cleanup()
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.config['steps']['databases']:
if type(e) == AssertionError: d.failed(item, str(e))
else: d.failed(item, reason="unexpected error")
for d in self.databases:
if type(e) == AssertionError:
d.failed(item, str(e))
else:
d.failed(item, reason="unexpected error")
finally:
if tmp_dir:
# remove the tmp_dir from all modules
for m in self.all_modules:
m.tmp_dir = None
tmp_dir.cleanup()
def archive(self, result: Metadata) -> Union[Metadata, None]:
@ -319,31 +404,38 @@ class ArchivingOrchestrator:
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
"""
original_url = result.get_url().strip()
self.assert_valid_url(original_url)
try:
self.assert_valid_url(original_url)
except AssertionError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
for a in self.extractors:
url = a.sanitize_url(url)
result.set_url(url)
if original_url != url: result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
cached_result = None
for d in self.config["steps"]["databases"]:
for d in self.databases:
d.started(result)
if (local_result := d.fetch(result)):
cached_result = (cached_result or Metadata()).merge(local_result)
if local_result := d.fetch(result):
cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
if cached_result:
logger.debug("Found previously archived entry")
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call extractors until one succeeds
for a in self.config["steps"]["extractors"]:
for a in self.extractors:
logger.info(f"Trying extractor {a.name} for {url}")
try:
result.merge(a.download(result))
@ -352,24 +444,25 @@ class ArchivingOrchestrator:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.config["steps"]["enrichers"]:
for e in self.enrichers:
try: e.enrich(result)
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
result.store()
result.store(storages=self.storages)
# 6 - format and store formatted if needed
if final_media := self.config["steps"]["formatters"][0].format(result):
final_media.store(url=url, metadata=result)
final_media: Media
if final_media := self.formatters[0].format(result):
final_media.store(url=url, metadata=result, storages=self.storages)
result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases and archivers
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@ -394,4 +487,11 @@ class ArchivingOrchestrator:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# Helper Properties
@property
def all_modules(self) -> List[Type[BaseModule]]:
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
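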
@ -1,25 +1,23 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from typing import IO, Optional
from typing import IO
import os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from loguru import logger
from slugify import slugify
from auto_archiver.utils.misc import random_str
@dataclass
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
if media.is_stored():
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
if media.is_stored(in_storage=self):
logger.debug(f"{media.key} already stored, skipping")
return
self.set_key(media, url)
self.set_key(media, url, metadata)
self.upload(media, metadata=metadata)
media.add_url(self.get_cdn_url(media))
@ -30,34 +28,35 @@ class Storage(BaseModule):
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url) -> None:
def set_key(self, media: Media, url, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = ArchivingContext.get("folder", "")
folder = metadata.get_context('folder', '')
filename, ext = os.path.splitext(media.filename)
# Handle path_generator logic
path_generator = ArchivingContext.get("path_generator", "url")
path_generator = self.config.get("path_generator", "url")
if path_generator == "flat":
path = ""
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url":
path = slugify(url)
elif path_generator == "random":
path = ArchivingContext.get("random_path", random_str(24), True)
path = self.config.get("random_path", random_str(24), True)
else:
raise ValueError(f"Invalid path_generator: {path_generator}")
# Handle filename_generator logic
filename_generator = ArchivingContext.get("filename_generator", "random")
filename_generator = self.config.get("filename_generator", "random")
if filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:
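As a side note on the `path_generator` / `filename_generator` options read here, a hedged example of how a storage module might be configured (option names and choices come from the storage manifests in this PR; the `gdrive_storage` key is just one example):
```yaml
gdrive_storage:
  path_generator: url         # one of: flat, url, random
  filename_generator: static  # one of: random, static (hash-based)
```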
@ -1,7 +1,19 @@
# used as validators for config values.
# used as validators for config values. Should raise an exception if the value is invalid.
from pathlib import Path
import argparse
def example_validator(value):
return "example" in value
if "example" not in value:
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
return value
def positive_number(value):
return value > 0
if value < 0:
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
return value
def valid_file(value):
if not Path(value).is_file():
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
return value
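For context, a hypothetical manifest excerpt showing how a config option can point at one of these validators by name, mirroring the `"type": "valid_file"` usage in the CSV Feeder manifest later in this diff (the lookup mechanism is assumed):
```python
# hypothetical __manifest__.py excerpt
{
    "configs": {
        "files": {
            "required": True,
            "type": "valid_file",  # resolved to validators.valid_file when parsing args
            "nargs": "+",
            "help": "input file(s), one URL per line",
        },
    },
}
```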
@ -1 +1 @@
from api_db import AAApiDb
from .api_db import AAApiDb
@ -1,28 +1,49 @@
{
"name": "Auto-Archiver API Database",
"type": ["database"],
"entry_point": "api_db:AAApiDb",
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
"loguru"],
"dependencies": {
"python": ["requests", "loguru"],
},
"configs": {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
"tags": {"default": [], "help": "what tags to add to the archived URL",}
"api_endpoint": {
"required": True,
"help": "API endpoint where calls are made to",
},
"api_token": {"default": None,
"help": "API Bearer token."},
"public": {
"default": False,
"type": "bool",
"help": "whether the URL should be publicly available via the API",
},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {
"default": None,
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,
"type": "bool",
"help": "when set, will send the results to the API database.",
},
"tags": {
"default": [],
"help": "what tags to add to the archived URL",
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
- **Optional Storage**: Archives results conditionally based on configuration.
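A hedged example of the corresponding orchestration config for this module (keys mirror the manifest above; the endpoint URL and token are placeholders):
```yaml
api_db:
  api_endpoint: https://aa-api.example.com   # required
  api_token: "<bearer token>"
  public: false
  use_api_cache: true
  store_results: true
  tags: [my-project]
```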
@ -1,5 +1,7 @@
from typing import Union
import requests, os
import os
import requests
from loguru import logger
from auto_archiver.core import Database
@ -7,27 +9,17 @@ from auto_archiver.core import Metadata
class AAApiDb(Database):
"""
Connects to auto-archiver-api instance
"""
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint")
"""Connects to auto-archiver-api instance"""
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item.
Helps avoid re-archiving the same URL multiple times.
"""
if not self.allow_rearchive: return
if not self.use_api_cache: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
if response.status_code == 200:
if len(response.json()):
@ -38,21 +30,26 @@ class AAApiDb(Database):
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
if not self.store_results: return
if cached:
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
payload = {
'author_id': self.author_id,
'url': item.get_url(),
'public': self.public,
'group_id': self.group_id,
'tags': list(self.tags),
'result': item.to_json(),
}
headers = {"Authorization": f"Bearer {self.api_token}"}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
if response.status_code == 200:
if response.status_code == 201:
logger.success(f"AA API: {response.json()}")
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
@ -1 +0,0 @@
from .atlos import AtlosStorage
@ -1,40 +0,0 @@
{
"name": "atlos_storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {"python": ["loguru", "requests"], "bin": [""]},
"configs": {
"path_generator": {
"default": "url",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
},
"filename_generator": {
"default": "random",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
},
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str",
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str",
},
},
"description": """
AtlosStorage: A storage module for saving media files to the Atlos platform.
### Features
- Uploads media files to Atlos using Atlos-specific APIs.
- Automatically calculates SHA-256 hashes of media files for integrity verification.
- Skips uploads for files that already exist on Atlos with the same hash.
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
- Provides CDN-like URLs for accessing uploaded media.
### Notes
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
""",
}
@ -1,9 +1,9 @@
{
"name": "Atlos Database",
"type": ["database"],
"entry_point": "atlos_db:AtlosDb",
"entry_point": "atlos_db::AtlosDb",
"requires_setup": True,
"external_dependencies":
"dependencies":
{"python": ["loguru",
""],
"bin": [""]},
@ -1,14 +1,10 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options
class AtlosDb(Database):
@ -2,14 +2,14 @@
"name": "Atlos Feeder",
"type": ["feeder"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"api_token": {
"default": None,
"type": "str",
"required": True,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
@ -1,19 +1,12 @@
from loguru import logger
import requests
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options
from auto_archiver.core import Metadata
class AtlosFeeder(Feeder):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
if type(self.api_token) != str:
raise Exception("Atlos Feeder did not receive an Atlos API token")
def __iter__(self) -> Metadata:
# Get all the urls from the Atlos API
count = 0
@ -47,5 +40,3 @@ class AtlosFeeder(Feeder):
if len(data["results"]) == 0 or cursor is None:
break
logger.success(f"Processed {count} URL(s)")
@ -1,12 +1,12 @@
import os
from typing import IO, List, Optional
from loguru import logger
import requests
import hashlib
import os
from typing import IO, Optional
import requests
from loguru import logger
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):
@ -1 +0,0 @@
from .cli_feeder import CLIFeeder
@ -1,27 +0,0 @@
{
"name": "CLI Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
"required": True,
"do_not_store": True,
"metavar": "INPUT URLS",
},
},
"description": """
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
### Features
- Takes a single URL or a list of URLs provided via the command line.
- Converts each URL into a `Metadata` object and yields it for processing.
- Ensures URLs are processed only if they are explicitly provided.
"""
}
@ -1,15 +0,0 @@
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")
@ -2,7 +2,7 @@
"name": "Console Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """
@ -2,7 +2,7 @@
"name": "CSV Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {"python": ["loguru"]
"dependencies": {"python": ["loguru"]
},
'entry_point': 'csv_db::CSVDb',
"configs": {
@ -2,7 +2,7 @@
"name": "CSV Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": [""]
},
@ -13,6 +13,9 @@
"default": None,
"help": "Path to the input file(s) to read the URLs from, comma separated. \
Input files should be formatted with one URL per line",
"required": True,
"type": "valid_file",
"nargs": "+",
},
"column": {
"default": None,
@ -26,9 +29,9 @@
- Supports reading URLs from multiple input files, specified as a comma-separated list.
- Allows specifying the column number or name to extract URLs from.
- Skips header rows if the first value is not a valid URL.
- Integrates with the `ArchivingContext` to manage URL feeding.
### Setup
- Input files should be formatted with one URL per line.
### Setup
- Input files should be formatted with one URL per line, with or without a header row.
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
"""
}
@ -2,24 +2,37 @@ from loguru import logger
import csv
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
column = None
def __iter__(self) -> Metadata:
url_column = self.column or 0
for file in self.files:
with open(file, "r") as f:
reader = csv.reader(f)
first_row = next(reader)
if not(url_or_none(first_row[url_column])):
# it's a header row, skip it
url_column = self.column or 0
if isinstance(url_column, str):
try:
url_column = first_row.index(url_column)
except ValueError:
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
return
elif not(url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already
logger.debug(f"Skipping header row: {first_row}")
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
else:
# first row isn't a header row, rewind the file
f.seek(0)
logger.success(f"Processed {len(self.urls)} URL(s)")
for row in reader:
if not url_or_none(row[url_column]):
logger.warning(f"Not a valid URL in row: {row}, skipping")
continue
url = row[url_column]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
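To illustrate the header handling above, a hypothetical input file and matching config (the file name and header are made up; `files` and `column` follow the manifest and the attributes used in this code):
```yaml
# urls.csv:
#   link,notes
#   https://example.com/post/1,first item
#   https://example.com/post/2,second item
csv_feeder:
  files: [urls.csv]
  column: link   # a header name; a numeric index also works when there is no header
```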
@ -1,14 +1,14 @@
{
"name": "Google Drive Storage",
"type": ["storage"],
"author": "Dave Mateer",
"entry_point": "gdrive_storage::GDriveStorage",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"google-api-python-client",
"google-auth",
"google-auth-oauthlib",
"google-auth-httplib2"
"googleapiclient",
"google",
],
},
"configs": {
@ -18,17 +18,23 @@
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"root_folder_id": {"required": True,
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None,
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
},
"description": """
GDriveStorage: A storage module for saving archived content to Google Drive.
Author: Dave Mateer (and maintained by: )
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
### Features
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
- Supports OAuth token-based authentication or service account credentials for API access.
@ -39,5 +45,55 @@
- Requires setup with either a Google OAuth token or a service account JSON file.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
- Automatically handles Google Drive API token refreshes for long-running jobs.
"""
## Overview
This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
## Features
- Saves files to Google Drive, organizing them into structured folders.
- Supports both **service account** and **OAuth token** authentication.
- Automatically creates folders if they don't exist.
- Generates public URLs for easy file sharing.
## Setup Guide
1. **Enable Google Drive API**
- Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
- Enable the **Google Drive API**.
2. **Set Up a Google Drive Folder**
- Create a folder in **Google Drive** and copy its **folder ID** from the URL.
- Add the **folder ID** to your configuration (`orchestration.yaml`):
```yaml
root_folder_id: "FOLDER_ID"
```
3. **Authentication Options**
- **Option 1: Service Account (Recommended)**
- Create a **service account** in Google Cloud IAM.
- Download the JSON key file and save it as:
```
secrets/service_account.json
```
- **Share your Drive folder** with the service account's `client_email` (found in the JSON file).
- **Option 2: OAuth Token (User Authentication)**
- Create OAuth **Desktop App credentials** in Google Cloud.
- Save the credentials as:
```
secrets/oauth_credentials.json
```
- Generate an OAuth token by running:
```sh
python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
```
Notes on the OAuth token:
Tokens are refreshed after 1 hour but keep working for 7 days (tbc),
so as long as the job doesn't run for longer than 7 days, this method of refreshing only once per run will work.
See this link for details on the token:
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
}
@ -1,68 +1,67 @@
import shutil, os, time, json
import json
import os
import time
from typing import IO
from loguru import logger
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.auth.transport.requests import Request
from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
class GDriveStorage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.scopes = ['https://www.googleapis.com/auth/drive']
# Initialize Google Drive service
self._setup_google_drive_service()
SCOPES = ['https://www.googleapis.com/auth/drive']
if self.oauth_token is not None:
"""
Tokens are refreshed after 1 hour
however keep working for 7 days (tbc)
so as long as the job doesn't last for 7 days
then this method of refreshing only once per run will work
see this link for details on the token
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
logger.debug(f'Using GD OAuth token {self.oauth_token}')
# workaround for missing 'refresh_token' in from_authorized_user_file
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
# creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
logger.debug('Requesting new GD OAuth token')
creds.refresh(Request())
else:
raise Exception("Problem with creds - create the token again")
# Save the credentials for the next run
with open(self.oauth_token, 'w') as token:
logger.debug('Saving new GD OAuth token')
token.write(creds.to_json())
else:
logger.debug('GD OAuth Token valid')
def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials."""
if self.oauth_token:
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token()
elif self.service_account:
logger.debug(f"Using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account()
else:
gd_service_account = self.service_account
logger.debug(f'Using GD Service Account {gd_service_account}')
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
self.service = build('drive', 'v3', credentials=creds)
def _initialize_with_oauth_token(self):
"""Initialize Google Drive service with OAuth token."""
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request())
with open(self.oauth_token, 'w') as token_file:
logger.debug("Saving refreshed OAuth token.")
token_file.write(creds.to_json())
elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.")
return build('drive', 'v3', credentials=creds)
def _initialize_with_service_account(self):
"""Initialize Google Drive service with service account."""
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
return build('drive', 'v3', credentials=creds)
def get_cdn_url(self, media: Media) -> str:
"""
only support files saved in a folder for GD
S3 supports folder and all stored in the root
"""
# full_name = os.path.join(self.folder, media.key)
parent_id, folder_id = self.root_folder_id, None
path_parts = media.key.split(os.path.sep)
@ -71,13 +70,16 @@ class GDriveStorage(Storage):
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id
# get id of file inside folder (or sub folder)
file_id = self._get_id_from_parent_and_name(folder_id, filename)
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
if not file_id:
#
logger.info(f"file {filename} not found in folder {folder_id}")
return None
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
"""
1. for each sub-folder in the path check if exists or create
2. upload file to root_id/other_paths.../filename
@ -105,7 +107,13 @@ class GDriveStorage(Storage):
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
def _get_id_from_parent_and_name(self, parent_id: str,
name: str,
retries: int = 1,
sleep_seconds: int = 10,
use_mime_type: bool = False,
raise_on_missing: bool = True,
use_cache=False):
"""
Retrieves the id of a folder or file from its @name and the @parent_id folder
Optionally does multiple @retries and sleeps @sleep_seconds between them
@ -168,8 +176,3 @@ class GDriveStorage(Storage):
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
return gd_folder.get('id')
# def exists(self, key):
# try:
# self.get_cdn_url(key)
# return True
# except: return False
@ -20,6 +20,7 @@ the broader archiving framework.
- Retrieves metadata like titles, descriptions, upload dates, and durations.
- Downloads subtitles and comments when enabled.
- Configurable options for handling live streams, proxies, and more.
- Supports authentication for websites using the 'authentication' settings from your orchestration config.
### Dropins
- For websites supported by `yt-dlp` that also contain posts in addition to videos
@ -29,10 +30,6 @@ custom dropins can be created to handle additional websites and passed to the ar
via the command line using the `--dropins` option (TODO!).
""",
"configs": {
"facebook_cookie": {
"default": None,
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
},
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
"comments": {
"default": False,
@ -67,14 +64,5 @@ via the command line using the `--dropins` option (TODO!).
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"cookies_from_browser": {
"default": None,
"type": "str",
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
},
"cookie_file": {
"default": None,
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
},
},
}
@ -23,19 +23,8 @@ class Bluesky(GenericDropin):
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
# handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
# return ie_instance._extract_post(handle=handle, post_id=video_id)
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
return ie_instance._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 0,
'parentHeight': 0,
})['thread']['post']
return ie_instance._extract_post(handle=handle, post_id=video_id)
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
"""
@ -0,0 +1,17 @@
from .dropin import GenericDropin
class Facebook(GenericDropin):
def extract_post(self, url: str, ie_instance):
video_id = ie_instance._match_valid_url(url).group('id')
ie_instance._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
return post_data
def create_metadata(self, post: dict, ie_instance, archiver, url):
metadata = archiver.create_metadata(url)
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
return metadata
@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.core.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
@ -266,19 +266,30 @@ class GenericExtractor(Extractor):
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
"live_from_start": self.live_from_start, "proxy": self.proxy,
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/password -> api_key -> cookie -> cookie_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
ydl_options['username'] = auth['username']
ydl_options['password'] = auth['password']
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
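A sketch of what the `authentication` settings consumed by `auth_for_site` might look like, inferred only from the keys checked above (the per-site layout and exact schema are assumptions, not confirmed by this diff):
```yaml
authentication:
  twitter.com:
    username: my_user
    password: my_password
  facebook.com:
    cookie: "datr=xxxx"
  youtube.com:
    cookies_file: secrets/youtube_cookies.txt
```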
@ -5,7 +5,7 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
@ -3,8 +3,8 @@
"type": ["database"],
"entry_point": "gsheet_db::GsheetsDb",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"allow_worksheets": {
@ -17,6 +17,7 @@
},
"use_sheet_names_in_stored_paths": {
"default": True,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
},
@ -1,39 +1,38 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.utils.misc import get_current_timestamp
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason:str) -> None:
def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, f'Archive failed {reason}')
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
self._safe_status_update(item, '')
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check if the given item has been archived already"""
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item)
@ -45,23 +44,25 @@ class GsheetsDb(Database):
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
try:
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
cell_updates.append((row, "status", status_message))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
batch_if_valid("archive", "\n".join(media.urls))
batch_if_valid("date", True, get_current_timestamp())
batch_if_valid("title", item.get_title())
batch_if_valid("text", item.get("content", ""))
batch_if_valid("timestamp", item.get_timestamp())
if media:
batch_if_valid("hash", media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []
@ -70,34 +71,44 @@ class GsheetsDb(Database):
if pdq := m.get("pdq_hash"):
pdq_hashes.append(pdq)
if len(pdq_hashes):
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
screenshot, "urls"
):
batch_if_valid("screenshot", "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):
if thumbnail := item.get_first_image("thumbnail"):
if hasattr(thumbnail, "urls"):
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
if (browsertrix := item.get_media_by_id("browsertrix")):
batch_if_valid('wacz', "\n".join(browsertrix.urls))
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
if browsertrix := item.get_media_by_id("browsertrix"):
batch_if_valid("wacz", "\n".join(browsertrix.urls))
batch_if_valid(
"replaywebpage",
"\n".join(
[
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
for wacz in browsertrix.urls
]
),
)
gw.batch_set_cell(cell_updates)
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', new_status)
gw.set_cell(row, "status", new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
if gsheet := ArchivingContext.get("gsheet"):
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
print(self.sheet_id)
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
return gw, row
@ -3,8 +3,8 @@
"type": ["feeder"],
"entry_point": "gsheet_feeder::GsheetsFeeder",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"},
@ -15,14 +15,13 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from . import GWorksheet
class GsheetsFeeder(Feeder):
def setup(self, config: dict):
super().setup(config)
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
@ -37,43 +36,48 @@ class GsheetsFeeder(Feeder):
def __iter__(self) -> Metadata:
sh = self.open_sheet()
for ii, wks in enumerate(sh.worksheets()):
if not self.should_process_sheet(wks.title):
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
continue
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.success(f'Finished worksheet {worksheet.title}')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
# All checks done - archival process starts here
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
else:
ArchivingContext.set("folder", folder, True)
# All checks done - archival process starts here
m = Metadata().set_url(url)
self._set_context(m, gw, row)
yield m
yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
# TODO: Check folder value not being recognised
m.set_context("gsheet", {"row": row, "worksheet": gw})
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
else:
m.set_context("folder", folder)
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
@ -2,7 +2,7 @@
"name": "Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {
@ -11,7 +11,8 @@ import hashlib
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils.misc import calculate_file_hash
class HashEnricher(Enricher):
@ -19,16 +20,6 @@ class HashEnricher(Enricher):
Calculates hashes for Media instances
"""
def __init__(self, config: dict = None):
"""
Initialize the HashEnricher with a configuration dictionary.
"""
super().__init__()
# TODO set these from the manifest?
# Set default values
self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256"
self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@ -39,15 +30,10 @@ class HashEnricher(Enricher):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename) -> str:
hash = None
hash_algo = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256()
hash_algo = hashlib.sha256
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
hash_algo = hashlib.sha3_512
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
return calculate_file_hash(filename, hash_algo, self.chunksize)
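A sketch of the `calculate_file_hash` helper the enricher now delegates to, reconstructed from the inline loop removed above (the actual signature in `auto_archiver.utils.misc` is assumed):
```python
# assumed helper; mirrors the chunked-read loop previously inlined in the enricher
def calculate_file_hash(filename: str, hash_algo, chunksize: int) -> str:
    hasher = hash_algo()
    with open(filename, "rb") as f:
        while True:
            buf = f.read(chunksize)
            if not buf:
                break
            hasher.update(buf)
    return hasher.hexdigest()
```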
@ -2,8 +2,8 @@
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"dependencies": {
"python": ["hash_enricher", "loguru", "jinja2"],
"bin": [""]
},
"configs": {
@ -1,5 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
@ -8,20 +7,18 @@ import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
@dataclass
class HtmlFormatter(Formatter):
environment: Environment = None
template: any = None
def setup(self, config: dict) -> None:
def setup(self) -> None:
"""Sets up the Jinja2 environment and loads the template."""
super().setup(config) # Ensure the base class logic is executed
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
@ -48,12 +45,13 @@ class HtmlFormatter(Formatter):
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")
@ -1,7 +1,8 @@
{
"name": "Instagram API Extractor",
"type": ["extractor"],
"external_dependencies":
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
"dependencies":
{"python": ["requests",
"loguru",
"retrying",
@ -9,24 +10,31 @@
},
"requires_setup": True,
"configs": {
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"access_token": {"default": None,
"help": "a valid instagrapi-api token"},
"api_endpoint": {"required": True,
"help": "API endpoint to use"},
"full_profile": {
"default": False,
"type": "bool",
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
},
"full_profile_max_posts": {
"default": 0,
"type": "int",
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
},
"minimize_json_output": {
"default": True,
"type": "bool",
"help": "if true, will remove empty values from the json output",
},
},
"description": """
Archives various types of Instagram content using the Instagrapi API.
Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
### Features
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
- Supports advanced configuration options, including:
@ -28,20 +28,14 @@ class InstagramAPIExtractor(Extractor):
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
global_pattern = re.compile(
valid_url = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
def setup(self) -> None:
if self.api_endpoint[-1] == "/":
self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
self.minimize_json_output = bool(self.minimize_json_output)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
@ -49,7 +43,7 @@ class InstagramAPIExtractor(Extractor):
url.replace("instagr.com", "instagram.com").replace(
"instagr.am", "instagram.com"
)
insta_matches = self.global_pattern.findall(url)
insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return
@ -1,7 +1,7 @@
{
"name": "Instagram Extractor",
"type": ["extractor"],
"external_dependencies": {
"dependencies": {
"python": [
"instaloader",
"loguru",
@ -9,9 +9,10 @@
},
"requires_setup": True,
"configs": {
"username": {"default": None, "help": "a valid Instagram username"},
"username": {"required": True,
"help": "a valid Instagram username"},
"password": {
"default": None,
"required": True,
"help": "the corresponding Instagram account password",
},
"download_folder": {
@ -25,9 +26,11 @@
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts. Authentication is required via username/password or a session file.
"description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts.
Authentication is required via username/password or a session file.
""",
}
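A hypothetical sketch of configuring this extractor programmatically, following the `get_module(module_name, {module_name: config})` pattern used in the updated test conftest at the end of this diff (all values are made up):
```python
from auto_archiver.core.module import get_module

config = {
    "username": "my_insta_user",
    "password": "my_insta_password",
    "download_folder": "instaloader",
}
extractor = get_module("instagram_extractor", {"instagram_extractor": config})
```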


@ -4,7 +4,7 @@
"""
import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
import instaloader
from loguru import logger
from auto_archiver.core import Extractor
@ -16,19 +16,17 @@ class InstagramExtractor(Extractor):
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/"
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
# TODO: links to stories
def __init__(self, config: dict) -> None:
super().__init__(config)
# TODO: refactor how configuration validation is done
self.assert_valid_string("username")
self.assert_valid_string("password")
self.assert_valid_string("download_folder")
self.assert_valid_string("session_file")
def setup(self) -> None:
self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
)
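For reference, a minimal sketch (not part of this diff; the example URLs are made up) of how the composed `post_pattern` and `profile_pattern` above behave when `valid_url` is the raw pattern string interpolated by the `.format()` calls:
```python
import re

valid_url = r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/"
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))

post_pattern.findall("https://www.instagram.com/p/C1a2B3c4D5e/")   # ['C1a2B3c4D5e']
profile_pattern.findall("https://instagram.com/bellingcat")        # ['bellingcat']
```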


@ -1,15 +1,16 @@
{
"name": "Instagram Telegram Bot Extractor",
"type": ["extractor"],
"external_dependencies": {"python": ["loguru",
"telethon",],
"dependencies": {"python": ["loguru", "telethon",],
},
"requires_setup": True,
"configs": {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
"timeout": {"default": 45,
"type": "int",
"help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
@ -28,6 +29,12 @@ returned as part of a `Metadata` object.
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.
- The session file is created automatically and should be unique for each instance.
- You may need to enter your Telegram credentials (phone number) and a 2FA code sent to you the first time you run the extractor:
```
2025-01-30 00:43:49.348 | INFO | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login...
Please enter your phone (or bot token): +447123456789
Please enter the code you received: 00000
Signed in successfully as E C; remember to not break the ToS or you will risk an account ban!
```
""",
}


@ -16,7 +16,7 @@ from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@ -33,17 +33,30 @@ class InstagramTbotExtractor(Extractor):
2. checks if the session file is valid
"""
logger.info(f"SETUP {self.name} checking login...")
self._prepare_session_file()
self._initialize_telegram_client()
# make a copy of the session that is used exclusively with this archiver instance
def _prepare_session_file(self):
"""
Creates a copy of the session file for exclusive use with this archiver instance.
Ensures that a valid session file exists before proceeding.
"""
new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
if not os.path.exists(f"{self.session_file}.session"):
raise FileNotFoundError(f"Session file {self.session_file}.session not found.")
shutil.copy(self.session_file + ".session", new_session_file)
self.session_file = new_session_file.replace(".session", "")
def _initialize_telegram_client(self):
"""Initializes the Telegram client."""
try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
except OperationalError as e:
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
logger.error(
f"Unable to access the {self.session_file} session. "
"Ensure that you don't use the same session file here and in telethon_extractor. "
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
)
with self.client.start():
logger.success(f"SETUP {self.name} login works.")
@ -58,34 +71,51 @@ class InstagramTbotExtractor(Extractor):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
if "You must enter a URL to a post" in message:
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if message:
result.set_content(message).set_title(message[:128])
return result.success("insta-via-bot")
def _send_url_to_bot(self, url: str):
"""
Sends the URL to the 'instagram_load_bot' and returns (chat, since_id).
"""
chat = self.client.get_entity("instagram_load_bot")
since_message = self.client.send_message(entity=chat, message=url)
return chat, since_message.id
def _process_messages(self, chat, since_id, tmp_dir, result):
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
# Skip known filler message:
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
continue
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
return message.strip()


@ -2,7 +2,7 @@
"name": "Local Storage",
"type": ["storage"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {


@ -2,7 +2,7 @@
"name": "Archive Metadata Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """


@ -2,7 +2,7 @@
"name": "Media Metadata Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": ["exiftool"]
},


@ -2,7 +2,7 @@
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
},
"description": """ Default formatter.
""",


@ -1,11 +1,9 @@
from __future__ import annotations
from dataclasses import dataclass
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
@dataclass
class MuteFormatter(Formatter):
def format(self, item: Metadata) -> Media: return None


@ -2,8 +2,8 @@
"name": "PDQ Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
"dependencies": {
"python": ["loguru", "pdqhash", "numpy", "PIL"],
},
"description": """
PDQ Hash Enricher for generating perceptual hashes of media files.


@ -1 +1 @@
from .s3 import S3Storage
from .s3_storage import S3Storage


@ -2,17 +2,17 @@
"name": "S3 Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["boto3", "loguru"],
"dependencies": {
"python": ["hash_enricher", "boto3", "loguru"],
},
"configs": {
"path_generator": {
"default": "url",
"default": "flat",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
@ -20,7 +20,9 @@
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"random_no_duplicate": {"default": False,
"type": "bool",
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
@ -29,7 +31,9 @@
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
"private": {"default": False,
"type": "bool",
"help": "if true S3 files will not be readable online"},
},
"description": """
S3Storage: A storage module for saving media files to an S3-compatible object storage.
@ -45,5 +49,6 @@
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
- Depends on the `HashEnricher` module for hash calculation.
"""
}
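A minimal sketch of the `random_no_duplicate` scheme described above (an illustration under assumed names, not the module's actual code; see the storage changes below for the real implementation): the file's SHA-256 prefix becomes the folder, so identical content maps to the same path and a second upload can be skipped.
```python
import hashlib
import os
import uuid

NO_DUPLICATES_FOLDER = "no-dups/"

def no_duplicate_key(filename: str) -> str:
    # hash the file in chunks; a prefix of the digest becomes the folder name
    sha256 = hashlib.sha256()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(16_000_000), b""):
            sha256.update(chunk)
    folder = os.path.join(NO_DUPLICATES_FOLDER, sha256.hexdigest()[:24])
    _, ext = os.path.splitext(filename)
    # identical content -> identical folder, so an existing key there means "skip upload"
    return os.path.join(folder, f"{uuid.uuid4().hex[:24]}{ext}")
```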


@ -1,19 +1,19 @@
from typing import IO
import boto3, os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
import boto3
import os
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.s3 = boto3.client(
's3',
region_name=self.region,
@ -21,7 +21,6 @@ class S3Storage(Storage):
aws_access_key_id=self.key,
aws_secret_access_key=self.secret
)
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@ -41,15 +40,13 @@ class S3Storage(Storage):
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
@ -61,8 +58,7 @@ class S3Storage(Storage):
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
# checks if path exists and is not an empty folder
if not path.endswith('/'):


@ -2,7 +2,7 @@
"name": "Screenshot Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["chromedriver"]
},


@ -6,8 +6,8 @@ from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
@ -19,15 +19,17 @@ class ScreenshotEnricher(Enricher):
return
logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))


@ -2,8 +2,8 @@
"name": "SSL Certificate Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "python-slugify"],
"dependencies": {
"python": ["loguru", "slugify"],
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {


@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class SSLEnricher(Enricher):
@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
with open(cert_fn, "w") as f: f.write(cert)
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")


@ -2,7 +2,7 @@
"name": "Telegram Extractor",
"type": ["extractor"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": [
"requests",
"bs4",
@ -13,7 +13,7 @@
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.
is advised for more comprehensive functionality and higher-quality media extraction.
### Features
- Extracts images and videos from public Telegram message links (`t.me`).


@ -1 +1 @@
from .telethon_extractor import TelethonArchiver
from .telethon_extractor import TelethonExtractor


@ -2,7 +2,7 @@
"name": "telethon_extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["telethon",
"loguru",
"tqdm",


@ -6,19 +6,20 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
import re, time, json, os
import re, time, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
class TelethonExtractor(Extractor):
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
def setup(self) -> None:
"""
1. makes a copy of session_file that is removed in cleanup
2. trigger login process for telegram or proceed if already saved in a session file
@ -92,7 +93,7 @@ class TelethonArchiver(Extractor):
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
match = self.valid_url.search(url)
logger.debug(f"TELETHON: {match=}")
if not match: return False
@ -120,7 +121,7 @@ class TelethonArchiver(Extractor):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message


@ -2,8 +2,8 @@
"name": "Thumbnail Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "ffmpeg-python"],
"dependencies": {
"python": ["loguru", "ffmpeg"],
"bin": ["ffmpeg"]
},
"configs": {


@ -10,7 +10,7 @@ import ffmpeg, os
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.utils.misc import random_str
@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
duration = m.get("duration")


@ -2,7 +2,7 @@
"name": "Timestamping Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"slugify",


@ -10,8 +10,7 @@ from asn1crypto.core import Asn1Value
import certifi
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class TimestampingEnricher(Enricher):
"""
@ -33,7 +32,7 @@ class TimestampingEnricher(Enricher):
logger.warning(f"No hashes found in {url=}")
return
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
data_to_sign = "\n".join(hashes)
@ -102,9 +101,9 @@ class TimestampingEnricher(Enricher):
cert_chain = []
for cert in path:
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
with open(cert_fn, "wb") as f:
f.write(cert.dump())
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
return cert_chain
return cert_chain


@ -2,7 +2,7 @@
"name": "Twitter API Extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["requests",
"loguru",
"pytwitter",


@ -9,14 +9,13 @@ from pytwitter import Api
from slugify import slugify
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata,Media
from auto_archiver.core import Metadata, Media
class TwitterApiExtractor(Extractor):
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self, config: dict) -> None:
super().setup(config)
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self) -> None:
self.api_index = 0
self.apis = []
if len(self.bearer_tokens):
@ -54,7 +53,7 @@ class TwitterApiExtractor(Extractor):
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
matches = self.valid_url.findall(url)
if not len(matches): return False, False
username, tweet_id = matches[0] # only one URL supported


@ -3,15 +3,19 @@
"type": ["extractor"],
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
"python": ["loguru",
"vk_url_scraper"],
"dependencies": {
"python": ["loguru", "vk_url_scraper"],
},
"configs": {
"username": {"default": None, "help": "valid VKontakte username"},
"password": {"default": None, "help": "valid VKontakte password"},
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
"username": {"required": True,
"help": "valid VKontakte username"},
"password": {"required": True,
"help": "valid VKontakte password"},
"session_file": {
"default": "secrets/vk_config.v2.json",
"help": "valid VKontakte password",
},
},
"description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
@ -31,6 +35,5 @@ To use the `VkArchiver`, you must provide valid VKontakte login credentials and
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
"""
,
""",
}


@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
@ -12,10 +12,7 @@ class VkExtractor(Extractor):
Currently only works for /wall posts
"""
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("username")
self.assert_valid_string("password")
def setup(self) -> None:
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
def download(self, item: Metadata) -> Metadata:
@ -37,7 +34,7 @@ class VkExtractor(Extractor):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))


@ -1,8 +1,9 @@
{
"name": "WACZ Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"jsonlines",
@ -25,6 +26,7 @@
},
"description": """
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
### Features
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
@ -33,7 +35,7 @@
- Generates metadata from the archived page's content and structure (e.g., titles, text).
### Notes
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
- Requires Docker for running `browsertrix-crawler`.
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
"""
}


@ -5,9 +5,9 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
from auto_archiver.utils import url as UrlUtil, random_str
class WaczExtractorEnricher(Enricher, Extractor):
@ -19,6 +19,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
"""
def setup(self) -> None:
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
@ -49,7 +50,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
@ -152,7 +153,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)


@ -1 +0,0 @@
from .wayback_enricher import WaybackExtractorEnricher


@ -1,30 +0,0 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
},
"entry_point": "wayback_enricher::WaybackExtractorEnricher",
"configs": {
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
"key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "required": True, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
"""
}


@ -0,0 +1 @@
from .wayback_extractor_enricher import WaybackExtractorEnricher


@ -0,0 +1,56 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"timeout": {
"default": 15,
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
},
"if_not_archived_within": {
"default": None,
"help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA",
},
"key": {
"required": True,
"help": "wayback API key. to get credentials visit https://archive.org/account/s3.php",
},
"secret": {
"required": True,
"help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php",
},
"proxy_http": {
"default": None,
"help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port",
},
"proxy_https": {
"default": None,
"help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port",
},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
### Steps to Get a Wayback API Key:
- Sign up for an account at [Internet Archive](https://archive.org/account/signup).
- Log in to your account.
- Navigate to your [account settings](https://archive.org/account), or follow https://archive.org/developers/tutorial-get-ia-credentials.html.
- Under Wayback Machine API Keys, generate a new key.
- Note down your API key and secret, as they will be required for authentication.
""",
}
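A sketch of how the key/secret configured above are typically used against the Wayback Machine "Save Page Now" API (not part of this diff; the endpoint, header format, and field names are assumptions based on the SPN2 documentation linked from `if_not_archived_within`):
```python
import requests

def save_to_wayback(url: str, key: str, secret: str, if_not_archived_within: str | None = None) -> dict:
    headers = {
        "Accept": "application/json",
        "Authorization": f"LOW {key}:{secret}",  # assumption: S3-style Internet Archive credentials
    }
    data = {"url": url}
    if if_not_archived_within:
        data["if_not_archived_within"] = if_not_archived_within
    r = requests.post("https://web.archive.org/save/", headers=headers, data=data)
    r.raise_for_status()
    return r.json()  # typically contains a job_id that can be polled for the final snapshot URL
```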


@ -3,7 +3,7 @@ from loguru import logger
import time, requests
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core import Metadata
class WaybackExtractorEnricher(Enricher, Extractor):


@ -2,15 +2,19 @@
"name": "Whisper Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
"dependencies": {
"python": ["s3_storage", "loguru", "requests"],
},
"configs": {
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"api_endpoint": {"required": True,
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"required": True,
"help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
"action": {"default": "translate",
"help": "which Whisper operation to execute",
"choices": ["transcribe", "translate", "language_detection"]},
},
"description": """
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
@ -25,6 +29,7 @@
### Notes
- Requires a Whisper API endpoint and API key for authentication.
- Only compatible with S3-compatible storage systems for media file accessibility.
- **Note**: this stores the media files in S3 prior to enriching them, as Whisper requires public URLs to access the media files.
- Handles multiple jobs and retries for failed or incomplete processing.
"""
}


@ -3,9 +3,8 @@ import requests, time
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@ -14,18 +13,25 @@ class WhisperEnricher(Enricher):
Only works if an S3 compatible storage is used
"""
def enrich(self, to_enrich: Metadata) -> None:
if not self._get_s3_storage():
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store(url=url, metadata=to_enrich)
# TODO: this used to pass all storage items to store;
# now only S3 is passed and the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)
job_results[job_id] = False
@ -53,8 +59,8 @@ class WhisperEnricher(Enricher):
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
def submit_job(self, media: Media):
s3 = self._get_s3_storage()
s3_url = s3.get_cdn_url(media)
s3_url = self.s3.get_cdn_url(media)
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
payload = {
"url": s3_url,
@ -107,10 +113,3 @@ class WhisperEnricher(Enricher):
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
return result
return False
def _get_s3_storage(self) -> S3Storage:
try:
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
except:
logger.warning("No S3Storage instance found in storages")
return


@ -2,7 +2,6 @@
# we need to explicitly expose the available imports here
from .misc import *
from .webdriver import Webdriver
from .url import UrlUtil
from .atlos import get_atlos_config_options
# handy utils from ytdlp


@ -1,53 +0,0 @@
import json, gspread
from ..core import BaseModule
class Gsheets(BaseModule):
name = "gsheets"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO: config should be responsible for conversions
try: self.header = int(self.header)
except: pass
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
# TODO merge this into gsheets processors manifest
@staticmethod
def configs() -> dict:
return {
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
"columns": {
"default": {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'timestamp': 'upload timestamp',
'title': 'upload title',
'text': 'text content',
'screenshot': 'screenshot',
'hash': 'hash',
'pdq_hash': 'perceptual hashes',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
},
"help": "names of columns in the google sheet (stringified JSON object)",
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
},
}
def open_sheet(self):
if self.sheet:
return self.gsheets_client.open(self.sheet)
else: # self.sheet_id
return self.gsheets_client.open_by_key(self.sheet_id)


@ -1,7 +1,9 @@
import os, json, requests
import os
import json
import uuid
from datetime import datetime
from datetime import datetime, timezone
import requests
import hashlib
from loguru import logger
@ -51,9 +53,52 @@ def update_nested_dict(dictionary, update_dict):
else:
dictionary[key] = value
def random_str(length: int = 32) -> str:
assert length <= 32, "length must be at most 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
return json.loads(cli_val)
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
hash = hash_algo()
with open(filename, "rb") as f:
while True:
buf = f.read(chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
def get_current_datetime_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
# parse a datetime string with option of passing a specific format
try:
return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
except ValueError as e:
logger.error(f"Unable to parse datestring {dt_str}: {e}")
return None
def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
# Consistent parsing of timestamps
# If utc=True, the timezone is set to UTC,
# if iso=True, the output is an iso string
if not ts: return
try:
if isinstance(ts, str): ts = datetime.fromisoformat(ts)
if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
if utc: ts = ts.replace(tzinfo=timezone.utc)
if iso: return ts.isoformat()
return ts
except Exception as e:
logger.error(f"Unable to parse timestamp {ts}: {e}")
return None
def get_current_timestamp() -> str:
return get_timestamp(datetime.now())
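A brief usage sketch of the helpers added above (not part of this diff; the file name and timestamps are illustrative):
```python
from auto_archiver.utils.misc import (
    calculate_file_hash, get_timestamp, get_datetime_from_str, get_current_timestamp,
)

calculate_file_hash("secrets/example.zip")        # SHA-256 hex digest, read in ~16MB chunks
get_timestamp("2025-01-30T00:43:49")              # '2025-01-30T00:43:49+00:00'
get_timestamp(1738197829, iso=False)              # timezone-aware datetime with tzinfo set to UTC
get_datetime_from_str("30-01-2025", "%d-%m-%Y")   # datetime(2025, 1, 30, 0, 0)
get_current_timestamp()                           # current UTC time as an ISO-8601 string
```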


@ -1,79 +1,84 @@
import re
from urllib.parse import urlparse, urlunparse
class UrlUtil:
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
@staticmethod
def clean(url: str) -> str: return url
AUTHWALL_URLS = [
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
]
@staticmethod
def is_auth_wall(url: str) -> bool:
"""
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
"""
if UrlUtil.telegram_private.match(url): return True
if UrlUtil.is_istagram.match(url): return True
def domain_for_url(url: str) -> str:
"""
SECURITY: parse the domain using urllib to avoid any potential security issues
"""
return urlparse(url).netloc
return False
def clean(url: str) -> str:
return url
@staticmethod
def remove_get_parameters(url: str) -> str:
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
# useful for mimetypes to work
parsed_url = urlparse(url)
new_url = urlunparse(parsed_url._replace(query=''))
return new_url
def is_auth_wall(url: str) -> bool:
"""
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
"""
for regex in AUTHWALL_URLS:
if regex.match(url):
return True
@staticmethod
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
"""
clean_url = UrlUtil.remove_get_parameters(url)
return False
# favicons
if "favicon" in url: return False
# ignore icons
if clean_url.endswith(".ico"): return False
# ignore SVGs
if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
def remove_get_parameters(url: str) -> str:
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
# useful for mimetypes to work
parsed_url = urlparse(url)
new_url = urlunparse(parsed_url._replace(query=''))
return new_url
# twitter profile pictures
if "twimg.com/profile_images" in url: return False
if "twimg.com" in url and "/default_profile_images" in url: return False
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
"""
clean_url = remove_get_parameters(url)
# instagram profile pictures
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
# instagram recurring images
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
# favicons
if "favicon" in url: return False
# ignore icons
if clean_url.endswith(".ico"): return False
# ignore SVGs
if remove_get_parameters(url).endswith(".svg"): return False
# telegram
if "https://telegram.org/img/emoji/" in url: return False
# twitter profile pictures
if "twimg.com/profile_images" in url: return False
if "twimg.com" in url and "/default_profile_images" in url: return False
# youtube
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
if "https://www.youtube.com/s/search/audio/" in url: return False
# instagram profile pictures
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
# instagram recurring images
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
# ok
if " https://ok.ru/res/i/" in url: return False
# telegram
if "https://telegram.org/img/emoji/" in url: return False
# vk
if "https://vk.com/emoji/" in url: return False
if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False
# youtube
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
if "https://www.youtube.com/s/search/audio/" in url: return False
# wikipedia
if "wikipedia.org/static" in url: return False
# ok
if " https://ok.ru/res/i/" in url: return False
return True
# vk
if "https://vk.com/emoji/" in url: return False
if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False
@staticmethod
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)
# wikipedia
if "wikipedia.org/static" in url: return False
return True
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)
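A short usage sketch of the refactored module-level helpers (not part of this diff; imported under the `UrlUtil` alias used elsewhere in this changeset):
```python
from auto_archiver.utils import url as UrlUtil

UrlUtil.is_auth_wall("https://www.instagram.com/p/abc/")          # True: behind a login wall
UrlUtil.domain_for_url("https://t.me/some_channel/42")            # 't.me'
UrlUtil.remove_get_parameters("http://example.com/file.mp4?t=1")  # 'http://example.com/file.mp4'
UrlUtil.is_relevant_url("https://example.com/favicon.ico")        # False: recurring asset
UrlUtil.twitter_best_quality_url("https://pbs.twimg.com/media/abc?format=jpg&name=small")
# 'https://pbs.twimg.com/media/abc?format=jpg&name=orig'
```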


@ -9,12 +9,79 @@ from loguru import logger
from selenium.webdriver.common.by import By
import time
#import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar
class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool
cookies: str
cookiejar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies
self.cookiejar = cookiejar
self.facebook_accept_cookies = facebook_accept_cookies
def get(self, url: str):
if self.cookies or self.cookiejar:
# set up the driver to make it not 'cookie averse' (needs a context/URL)
# get the 'robots.txt' file which should be quick and easy
robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
super(CookieSettingDriver, self).get(robots_url)
if self.cookies:
# an explicit cookie is set for this site, use that first
for cookie in self.cookies.split(";"):
name, value = cookie.split("=", 1)
self.add_cookie({'name': name.strip(), 'value': value})
elif self.cookiejar:
domain = urlparse(url).netloc.lstrip("www.")
for cookie in self.cookiejar:
if domain in cookie.domain:
try:
self.add_cookie({
'name': cookie.name,
'value': cookie.value,
'path': cookie.path,
'domain': cookie.domain,
'secure': bool(cookie.secure),
'expiry': cookie.expires
})
except Exception as e:
logger.warning(f"Failed to add cookie to webdriver: {e}")
if self.facebook_accept_cookies:
try:
logger.debug(f'Trying fb click accept cookie popup.')
super(CookieSettingDriver, self).get("http://www.facebook.com")
essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
essential_only.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except Exception as e:
logger.warning(f'Failed on fb accept cookies: {e}')
# now get the actual URL
super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
if close_button:
close_button.click()
class Webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int,
facebook_accept_cookies: bool = False, http_proxy: str = "",
print_options: dict = {}, auth: dict = {}) -> webdriver:
self.width = width
self.height = height
self.timeout_seconds = timeout_seconds
self.auth = auth
self.facebook_accept_cookies = facebook_accept_cookies
self.http_proxy = http_proxy
# create and set print options
@ -23,32 +90,26 @@ class Webdriver:
setattr(self.print_options, k, v)
def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument(f'--proxy-server={self.http_proxy}')
options.set_preference('network.protocol-handler.external.tg', False)
# if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
if self.facebook_accept_cookies:
options.add_argument('--lang=en')
try:
self.driver = webdriver.Firefox(options=options)
self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
facebook_accept_cookies=self.facebook_accept_cookies, options=options)
self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
if self.facebook_accept_cookies:
try:
logger.debug(f'Trying fb click accept cookie popup.')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies.')
return self.driver
def __exit__(self, exc_type, exc_val, exc_tb):
self.driver.close()
self.driver.quit()
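A hypothetical usage sketch of the updated `Webdriver` context manager with the new `auth` parameter (not part of this diff; the cookie file path, window size, and URL are illustrative):
```python
from http.cookiejar import MozillaCookieJar

from auto_archiver.utils import Webdriver

cookiejar = MozillaCookieJar("secrets/cookies.txt")
cookiejar.load(ignore_discard=True, ignore_expires=True)

# cookies from the jar whose domain matches the target URL are set before the page is loaded
with Webdriver(1280, 2300, timeout_seconds=60, auth={"cookies_jar": cookiejar}) as driver:
    driver.get("https://example.com")
    driver.save_screenshot("example.png")
```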


@ -1,6 +0,0 @@
import tempfile
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())


@ -1,7 +1,9 @@
"""
pytest conftest file, for shared fixtures and configuration
"""
import os
import pickle
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest
@ -23,13 +25,15 @@ def setup_module(request):
# if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2]
m = get_module(module_name).load()
m.name = module_name
m.setup({module_name : config})
m = get_module(module_name, {module_name: config})
# add the tmp_dir to the module
tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name
def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup()
request.addfinalizer(cleanup)
return m
@ -110,4 +114,18 @@ def pytest_runtest_setup(item):
test_name = _test_failed_incremental[cls_name].get((), None)
# if name found, test has failed for the combination of class name & test name
if test_name is not None:
pytest.xfail(f"previous test failed ({test_name})")
pytest.xfail(f"previous test failed ({test_name})")
@pytest.fixture()
def unpickle():
"""
Returns a helper function that unpickles a file
** gets the file from the test_files directory: tests/data/test_files **
"""
def _unpickle(path):
test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
with open(os.path.join(test_data_dir, path), "rb") as f:
return pickle.load(f)
return _unpickle
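A hypothetical test using the `unpickle` fixture above (the pickle file name is made up):
```python
def test_loads_pickled_metadata(unpickle):
    item = unpickle("example_metadata.pickle")  # resolved against tests/data/test_files
    assert item is not None
```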


@ -0,0 +1,2 @@
https://example.com/1/,data 1
https://example.com/2/,data 2

Some files were not shown because too many files have changed in this diff.