Merge pull request #263 from bellingcat/wrong_steps

When loading modules, check they have been added to the right 'step' in the config
2025-03-20 15:31:38 +00:00 · 2025-03-20 15:31:38 +00:00 · b997bbea2b
commit b997bbea2b
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@ -85,7 +85,11 @@ class ModuleFactory:
        if not available:
            message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
            if "archiver" in module_name:
-                message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
+                message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
+            elif "gsheet" in module_name:
+                message += " Did you mean 'gsheet_feeder_db'?"
+            elif "atlos" in module_name:
+                message += " Did you mean 'atlos_feeder_db_storage'?"
            raise IndexError(message)
        return available[0]

--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                if module in invalid_modules:
                    continue

+                # check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
+                lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
+                if module_type not in lazy_module.type:
+                    types = ",".join(f"'{t}'" for t in lazy_module.type)
+                    raise SetupError(
+                        f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
+                    )
+
                loaded_module = None
                try:
-                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
+                    loaded_module: BaseModule = lazy_module.load(self.config)
                except (KeyboardInterrupt, Exception) as e:
                    if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
                        logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@ -1,6 +1,5 @@
 import re
 import mimetypes
-import json

 from loguru import logger
 from slugify import slugify
@ -32,6 +31,9 @@ class Twitter(GenericDropin):
        twid = ie_instance._match_valid_url(url).group("id")
        return ie_instance._extract_status(twid=twid)

+    def keys_to_clean(self, video_data, info_extractor):  # args unused; presumably keys the caller strips - TODO confirm
+        return ["user", "created_at", "entities", "favorited", "translator_type"]
+
    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
@ -42,9 +44,11 @@ class Twitter(GenericDropin):
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False

-        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
-            timestamp
-        )
+        full_text = tweet.pop("full_text", "")
+        author = tweet["user"].get("name", "")
+        result.set("author", author).set_url(url)
+
+        result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug("No media found, archiving tweet text only")
            result.status = "twitter-ytdl"
--- a/tests/data/test_modules/example_extractor/manifest.py
+++ b/tests/data/test_modules/example_extractor/manifest.py
@ -0,0 +1,11 @@
+{
+    # Display Name of your module
+    "name": "Example Extractor",
+    # Optional version number, for your own versioning purposes
+    "version": 2.0,
+    # The type of the module, must be one (or more) of the built in module types
+    "type": ["extractor"],
+    # a boolean indicating whether or not a module requires additional user setup before it can be used
+    # for example: adding API keys, installing additional software etc.
+    "requires_setup": False,
+}
--- a/tests/data/test_modules/example_extractor/example_extractor.py
+++ b/tests/data/test_modules/example_extractor/example_extractor.py
@ -0,0 +1,6 @@
+from auto_archiver.core import Extractor
+
+
+class ExampleExtractor(Extractor):  # minimal Extractor subclass shipped under tests/data/test_modules
+    def download(self, item):
+        print("download")  # only prints; `item` is not used and nothing is returned
--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@ -206,10 +206,11 @@ class TestGenericExtractor(TestExtractorBase):

        self.assertValidResponseMetadata(
            post,
-            "Onion rings are just vegetable donuts.",
+            "Cookie Monster - Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
            "yt-dlp_Twitter: success",
        )
+        assert post.get("content") == "Onion rings are just vegetable donuts."

    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@ -4,6 +4,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
 from auto_archiver.core import Metadata
+from auto_archiver.core.consts import SetupError

 TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
 TEST_MODULES = "tests/data/test_modules/"
@ -224,3 +225,15 @@ def test_multiple_orchestrator(test_args):
    output: Metadata = list(o2.feed())
    assert len(output) == 1
    assert output[0].get_url() == "https://example.com"
+
+
+def test_wrong_step_type(test_args, caplog):
+    args = test_args + [
+        "--feeders",
+        "example_extractor",  # example_extractor is not a valid feeder!
+    ]
+
+    orchestrator = ArchivingOrchestrator()
+    with pytest.raises(SetupError) as err:
+        orchestrator.setup(args)
+        assert "Module 'example_extractor' is not a feeder" in str(err.value)