Merge pull request #263 from bellingcat/wrong_steps

When loading modules, check they have been added to the right 'step' in the config
scoop
Patrick Robertson 2025-03-20 15:31:38 +00:00 zatwierdzone przez GitHub
commit b997bbea2b
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
7 zmienionych plików z 54 dodań i 7 usunięć

Wyświetl plik

@ -85,7 +85,11 @@ class ModuleFactory:
if not available:
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
if "archiver" in module_name:
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
elif "gsheet" in module_name:
message += " Did you mean 'gsheet_feeder_db'?"
elif "atlos" in module_name:
message += " Did you mean 'atlos_feeder_db_storage'?"
raise IndexError(message)
return available[0]

Wyświetl plik

@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
if module in invalid_modules:
continue
# check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
if module_type not in lazy_module.type:
types = ",".join(f"'{t}'" for t in lazy_module.type)
raise SetupError(
f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
)
loaded_module = None
try:
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
loaded_module: BaseModule = lazy_module.load(self.config)
except (KeyboardInterrupt, Exception) as e:
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")

Wyświetl plik

@ -1,6 +1,5 @@
import re
import mimetypes
import json
from loguru import logger
from slugify import slugify
@ -32,6 +31,9 @@ class Twitter(GenericDropin):
twid = ie_instance._match_valid_url(url).group("id")
return ie_instance._extract_status(twid=twid)
def keys_to_clean(self, video_data, info_extractor):
    """Return the names of the tweet fields this dropin marks for cleaning.

    Neither ``video_data`` nor ``info_extractor`` is consulted; the same
    five key names are returned, in a new list, on every call.
    NOTE(review): presumably the caller drops these keys from the stored
    metadata -- confirm against the generic extractor.
    """
    removable = ("user", "created_at", "entities", "favorited", "translator_type")
    return list(removable)
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
try:
@ -42,9 +44,11 @@ class Twitter(GenericDropin):
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
timestamp
)
full_text = tweet.pop("full_text", "")
author = tweet["user"].get("name", "")
result.set("author", author).set_url(url)
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug("No media found, archiving tweet text only")
result.status = "twitter-ytdl"

Wyświetl plik

@ -0,0 +1,11 @@
# Module manifest: a single dict literal describing the module
# (display name, version, module type(s) and whether extra setup is needed).
{
    # Human-readable display name of the module
    "name": "Example Extractor",
    # Optional version number, for your own versioning purposes
    # NOTE(review): this is a float literal, not a string -- confirm the loader accepts numeric versions
    "version": 2.0,
    # The type(s) of the module: a list holding one or more of the built-in module types
    "type": ["extractor"],
    # Boolean indicating whether the module requires additional user setup before it can be used,
    # for example: adding API keys, installing additional software etc.
    "requires_setup": False,
}

Wyświetl plik

@ -0,0 +1,6 @@
from auto_archiver.core import Extractor
class ExampleExtractor(Extractor):
    """Minimal ``Extractor`` subclass whose ``download`` only prints a marker."""

    def download(self, item):
        """Print ``download`` to stdout.

        ``item`` is accepted but not inspected, and nothing is returned
        explicitly (the call evaluates to ``None``).
        """
        print("download")

Wyświetl plik

@ -206,10 +206,11 @@ class TestGenericExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"Onion rings are just vegetable donuts.",
"Cookie Monster - Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"yt-dlp_Twitter: success",
)
assert post.get("content") == "Onion rings are just vegetable donuts."
@pytest.mark.download
def test_twitter_download_video(self, make_item):

Wyświetl plik

@ -4,6 +4,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core import Metadata
from auto_archiver.core.consts import SetupError
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/"
@ -224,3 +225,15 @@ def test_multiple_orchestrator(test_args):
output: Metadata = list(o2.feed())
assert len(output) == 1
assert output[0].get_url() == "https://example.com"
def test_wrong_step_type(test_args, caplog):
    """Setup must fail when a module is listed under a step it does not support.

    ``example_extractor`` is passed through ``--feeders``; ``setup`` is
    expected to raise ``SetupError`` whose message names the module and the
    step it was wrongly assigned to. ``caplog`` is requested but not read.
    """
    # example_extractor is not a valid feeder!
    wrong_step_flags = ["--feeders", "example_extractor"]
    cli_args = test_args + wrong_step_flags

    archiver = ArchivingOrchestrator()
    with pytest.raises(SetupError) as raised:
        archiver.setup(cli_args)

    expected_fragment = "Module 'example_extractor' is not a feeder"
    assert expected_fragment in str(raised.value)