From 5e5e1c43a179e678b8ae5614788763dc5eae0e83 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 18:09:26 +0400 Subject: [PATCH 1/3] When loading modules, check they have been added to the right 'step' in the config. Fixes an issue seen on Discord where a user accidentally set up metadata_enricher under 'extractors'. --- src/auto_archiver/core/module.py | 6 +++++- src/auto_archiver/core/orchestrator.py | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 903a4ab..c263d95 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -85,7 +85,11 @@ class ModuleFactory: if not available: message = f"Module '{module_name}' not found. Are you sure it's installed/exists?" if "archiver" in module_name: - message += f" Did you mean {module_name.replace('archiver', 'extractor')}?" + message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?" + elif "gsheet" in module_name: + message += " Did you mean 'gsheet_feeder_db'?" + elif "atlos" in module_name: + message += " Did you mean 'atlos_feeder_db_storage'?" raise IndexError(message) return available[0] diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index d06c287..cbd1af5 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ if module in invalid_modules: continue + # check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step' + lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module) + if module_type not in lazy_module.type: + types = ",".join(f"'{t}'" for t in lazy_module.type) + raise SetupError( + f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. 
Please check you set this module up under the right step in your orchestration file." + ) + loaded_module = None try: - loaded_module: BaseModule = self.module_factory.get_module(module, self.config) + loaded_module: BaseModule = lazy_module.load(self.config) except (KeyboardInterrupt, Exception) as e: if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError): logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") From 6700250891dbefddad536e58caddd372ffec1166 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 18:18:53 +0400 Subject: [PATCH 2/3] Add a test for checking module type on setup --- .../test_modules/example_extractor/__manifest__.py | 11 +++++++++++ .../example_extractor/example_extractor.py | 6 ++++++ tests/test_orchestrator.py | 13 +++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 tests/data/test_modules/example_extractor/__manifest__.py create mode 100644 tests/data/test_modules/example_extractor/example_extractor.py diff --git a/tests/data/test_modules/example_extractor/__manifest__.py b/tests/data/test_modules/example_extractor/__manifest__.py new file mode 100644 index 0000000..dc18dc7 --- /dev/null +++ b/tests/data/test_modules/example_extractor/__manifest__.py @@ -0,0 +1,11 @@ +{ + # Display Name of your module + "name": "Example Extractor", + # Optional version number, for your own versioning purposes + "version": 2.0, + # The type of the module, must be one (or more) of the built in module types + "type": ["extractor"], + # a boolean indicating whether or not a module requires additional user setup before it can be used + # for example: adding API keys, installing additional software etc. 
+ "requires_setup": False, +} diff --git a/tests/data/test_modules/example_extractor/example_extractor.py b/tests/data/test_modules/example_extractor/example_extractor.py new file mode 100644 index 0000000..1c63383 --- /dev/null +++ b/tests/data/test_modules/example_extractor/example_extractor.py @@ -0,0 +1,6 @@ +from auto_archiver.core import Extractor + + +class ExampleExtractor(Extractor): + def download(self, item): + print("download") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 326b93d..3367ce0 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -4,6 +4,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.version import __version__ from auto_archiver.core.config import read_yaml, store_yaml from auto_archiver.core import Metadata +from auto_archiver.core.consts import SetupError TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_MODULES = "tests/data/test_modules/" @@ -224,3 +225,15 @@ def test_multiple_orchestrator(test_args): output: Metadata = list(o2.feed()) assert len(output) == 1 assert output[0].get_url() == "https://example.com" + + +def test_wrong_step_type(test_args, caplog): + args = test_args + [ + "--feeders", + "example_extractor", # example_extractor is not a valid feeder! 
+ ] + + orchestrator = ArchivingOrchestrator() + with pytest.raises(SetupError) as err: + orchestrator.setup(args) + assert "Module 'example_extractor' is not a feeder" in str(err.value) From 0a5ba3385e75f8ce1878638a407d49489e0635f8 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 18:55:22 +0400 Subject: [PATCH 3/3] Fix small bug in Twitter dropin - previously the 'content' was being set to a JSON dump of the tweet; it should be set to full_text --- .../modules/generic_extractor/twitter.py | 12 ++++++++---- tests/extractors/test_generic_extractor.py | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index e27a0c1..189a7e6 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,6 +1,5 @@ import re import mimetypes -import json from loguru import logger from slugify import slugify @@ -32,6 +31,9 @@ class Twitter(GenericDropin): twid = ie_instance._match_valid_url(url).group("id") return ie_instance._extract_status(twid=twid) + def keys_to_clean(self, video_data, info_extractor): + return ["user", "created_at", "entities", "favorited", "translator_type"] + def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() try: @@ -42,9 +44,11 @@ class Twitter(GenericDropin): logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False - result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp( - timestamp - ) + full_text = tweet.pop("full_text", "") + author = tweet["user"].get("name", "") + result.set("author", author).set_url(url) + + result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp) if not tweet.get("entities", {}).get("media"): logger.debug("No 
media found, archiving tweet text only") result.status = "twitter-ytdl" diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index 2089007..616183b 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -206,10 +206,11 @@ class TestGenericExtractor(TestExtractorBase): self.assertValidResponseMetadata( post, - "Onion rings are just vegetable donuts.", + "Cookie Monster - Onion rings are just vegetable donuts.", datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc), "yt-dlp_Twitter: success", ) + assert post.get("content") == "Onion rings are just vegetable donuts." @pytest.mark.download def test_twitter_download_video(self, make_item):