From 5e5e1c43a179e678b8ae5614788763dc5eae0e83 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Thu, 20 Mar 2025 18:09:26 +0400
Subject: [PATCH 1/3] When loading modules, check they have been added to the
 right 'step' in the config

Fixes an issue seen on Discord where a user accidentally set up metadata_enricher under 'extractors'.
---
 src/auto_archiver/core/module.py       |  6 +++++-
 src/auto_archiver/core/orchestrator.py | 10 +++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index 903a4ab..c263d95 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -85,7 +85,11 @@ class ModuleFactory:
         if not available:
             message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
             if "archiver" in module_name:
-                message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
+                message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
+            elif "gsheet" in module_name:
+                message += " Did you mean 'gsheet_feeder_db'?"
+            elif "atlos" in module_name:
+                message += " Did you mean 'atlos_feeder_db_storage'?"
             raise IndexError(message)
         return available[0]
 
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index d06c287..cbd1af5 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                 if module in invalid_modules:
                     continue
 
+                # check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
+                lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
+                if module_type not in lazy_module.type:
+                    types = ",".join(f"'{t}'" for t in lazy_module.type)
+                    raise SetupError(
+                        f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
+                    )
+
                 loaded_module = None
                 try:
-                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
+                    loaded_module: BaseModule = lazy_module.load(self.config)
                 except (KeyboardInterrupt, Exception) as e:
                     if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
                         logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")

From 6700250891dbefddad536e58caddd372ffec1166 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Thu, 20 Mar 2025 18:18:53 +0400
Subject: [PATCH 2/3] Add a test for checking module type on setup

---
 .../test_modules/example_extractor/__manifest__.py  | 11 +++++++++++
 .../example_extractor/example_extractor.py          |  6 ++++++
 tests/test_orchestrator.py                          | 13 +++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 tests/data/test_modules/example_extractor/__manifest__.py
 create mode 100644 tests/data/test_modules/example_extractor/example_extractor.py

diff --git a/tests/data/test_modules/example_extractor/__manifest__.py b/tests/data/test_modules/example_extractor/__manifest__.py
new file mode 100644
index 0000000..dc18dc7
--- /dev/null
+++ b/tests/data/test_modules/example_extractor/__manifest__.py
@@ -0,0 +1,11 @@
+{
+    # Display name of your module
+    "name": "Example Extractor",
+    # Optional version number, for your own versioning purposes
+    "version": 2.0,
+    # The type of the module; must be one (or more) of the built-in module types
+    "type": ["extractor"],
+    # A boolean indicating whether or not the module requires additional user setup before it can be used,
+    # for example: adding API keys, installing additional software, etc.
+    "requires_setup": False,
+}
diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py
new file mode 100644
index 0000000..1c63383
--- /dev/null
+++ b/tests/data/test_modules/example_extractor/example_extractor.py
@@ -0,0 +1,6 @@
+from auto_archiver.core import Extractor
+
+
+class ExampleExtractor(Extractor):  # minimal test-module extractor: download() only prints
+    def download(self, item):
+        print("download")  # no return statement, so the call yields None
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 326b93d..3367ce0 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -4,6 +4,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
 from auto_archiver.core import Metadata
+from auto_archiver.core.consts import SetupError
 
 TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
 TEST_MODULES = "tests/data/test_modules/"
@@ -224,3 +225,15 @@ def test_multiple_orchestrator(test_args):
     output: Metadata = list(o2.feed())
     assert len(output) == 1
     assert output[0].get_url() == "https://example.com"
+
+
+def test_wrong_step_type(test_args, caplog):
+    """Setup must reject a module listed under a step that is not one of its types."""
+    # example_extractor is not a valid feeder!
+    args = test_args + ["--feeders", "example_extractor"]
+
+    orchestrator = ArchivingOrchestrator()
+    with pytest.raises(SetupError) as err:
+        orchestrator.setup(args)
+    # must run after the block exits: inside it, nothing after the raising call executes
+    assert "Module 'example_extractor' is not a feeder" in str(err.value)

From 0a5ba3385e75f8ce1878638a407d49489e0635f8 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Thu, 20 Mar 2025 18:55:22 +0400
Subject: [PATCH 3/3] Fix small bug in twitter dropin - previously the
 'content' was being set to a json dump of the tweet, it should be set to
 full_text

---
 .../modules/generic_extractor/twitter.py             | 12 ++++++++----
 tests/extractors/test_generic_extractor.py           |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py
index e27a0c1..189a7e6 100644
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,6 +1,5 @@
 import re
 import mimetypes
-import json
 
 from loguru import logger
 from slugify import slugify
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
         twid = ie_instance._match_valid_url(url).group("id")
         return ie_instance._extract_status(twid=twid)
 
+    def keys_to_clean(self, video_data, info_extractor):  # returns a fixed list; neither argument is read
+        return ["user", "created_at", "entities", "favorited", "translator_type"]
+
     def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
         result = Metadata()
         try:
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
             logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
             return False
 
-        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
-            timestamp
-        )
+        full_text = tweet.pop("full_text", "")
+        author = tweet["user"].get("name", "")
+        result.set("author", author).set_url(url)
+
+        result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
         if not tweet.get("entities", {}).get("media"):
             logger.debug("No media found, archiving tweet text only")
             result.status = "twitter-ytdl"
diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py
index 2089007..616183b 100644
--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@@ -206,10 +206,11 @@ class TestGenericExtractor(TestExtractorBase):
 
         self.assertValidResponseMetadata(
             post,
-            "Onion rings are just vegetable donuts.",
+            "Cookie Monster - Onion rings are just vegetable donuts.",
             datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
             "yt-dlp_Twitter: success",
         )
+        assert post.get("content") == "Onion rings are just vegetable donuts."
 
     @pytest.mark.download
     def test_twitter_download_video(self, make_item):