kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge pull request #263 from bellingcat/wrong_steps
When loading modules, check they have been added to the right 'step' in the config
scoop
commit
b997bbea2b
|
@ -85,7 +85,11 @@ class ModuleFactory:
|
||||||
if not available:
|
if not available:
|
||||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||||
if "archiver" in module_name:
|
if "archiver" in module_name:
|
||||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
|
||||||
|
elif "gsheet" in module_name:
|
||||||
|
message += " Did you mean 'gsheet_feeder_db'?"
|
||||||
|
elif "atlos" in module_name:
|
||||||
|
message += " Did you mean 'atlos_feeder_db_storage'?"
|
||||||
raise IndexError(message)
|
raise IndexError(message)
|
||||||
return available[0]
|
return available[0]
|
||||||
|
|
||||||
|
|
|
@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||||
if module in invalid_modules:
|
if module in invalid_modules:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
|
||||||
|
lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
|
||||||
|
if module_type not in lazy_module.type:
|
||||||
|
types = ",".join(f"'{t}'" for t in lazy_module.type)
|
||||||
|
raise SetupError(
|
||||||
|
f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
|
||||||
|
)
|
||||||
|
|
||||||
loaded_module = None
|
loaded_module = None
|
||||||
try:
|
try:
|
||||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
loaded_module: BaseModule = lazy_module.load(self.config)
|
||||||
except (KeyboardInterrupt, Exception) as e:
|
except (KeyboardInterrupt, Exception) as e:
|
||||||
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
|
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
|
||||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import re
|
import re
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import json
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
@ -32,6 +31,9 @@ class Twitter(GenericDropin):
|
||||||
twid = ie_instance._match_valid_url(url).group("id")
|
twid = ie_instance._match_valid_url(url).group("id")
|
||||||
return ie_instance._extract_status(twid=twid)
|
return ie_instance._extract_status(twid=twid)
|
||||||
|
|
||||||
|
def keys_to_clean(self, video_data, info_extractor):
|
||||||
|
return ["user", "created_at", "entities", "favorited", "translator_type"]
|
||||||
|
|
||||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
try:
|
try:
|
||||||
|
@ -42,9 +44,11 @@ class Twitter(GenericDropin):
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
|
full_text = tweet.pop("full_text", "")
|
||||||
timestamp
|
author = tweet["user"].get("name", "")
|
||||||
)
|
result.set("author", author).set_url(url)
|
||||||
|
|
||||||
|
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
||||||
if not tweet.get("entities", {}).get("media"):
|
if not tweet.get("entities", {}).get("media"):
|
||||||
logger.debug("No media found, archiving tweet text only")
|
logger.debug("No media found, archiving tweet text only")
|
||||||
result.status = "twitter-ytdl"
|
result.status = "twitter-ytdl"
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
{
|
||||||
|
# Display Name of your module
|
||||||
|
"name": "Example Extractor",
|
||||||
|
# Optional version number, for your own versioning purposes
|
||||||
|
"version": 2.0,
|
||||||
|
# The type of the module, must be one (or more) of the built in module types
|
||||||
|
"type": ["extractor"],
|
||||||
|
# a boolean indicating whether or not a module requires additional user setup before it can be used
|
||||||
|
# for example: adding API keys, installing additional software etc.
|
||||||
|
"requires_setup": False,
|
||||||
|
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
from auto_archiver.core import Extractor
|
||||||
|
|
||||||
|
|
||||||
|
class ExampleExtractor(Extractor):
|
||||||
|
def download(self, item):
|
||||||
|
print("download")
|
|
@ -206,10 +206,11 @@ class TestGenericExtractor(TestExtractorBase):
|
||||||
|
|
||||||
self.assertValidResponseMetadata(
|
self.assertValidResponseMetadata(
|
||||||
post,
|
post,
|
||||||
"Onion rings are just vegetable donuts.",
|
"Cookie Monster - Onion rings are just vegetable donuts.",
|
||||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||||
"yt-dlp_Twitter: success",
|
"yt-dlp_Twitter: success",
|
||||||
)
|
)
|
||||||
|
assert post.get("content") == "Onion rings are just vegetable donuts."
|
||||||
|
|
||||||
@pytest.mark.download
|
@pytest.mark.download
|
||||||
def test_twitter_download_video(self, make_item):
|
def test_twitter_download_video(self, make_item):
|
||||||
|
|
|
@ -4,6 +4,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||||
from auto_archiver.version import __version__
|
from auto_archiver.version import __version__
|
||||||
from auto_archiver.core.config import read_yaml, store_yaml
|
from auto_archiver.core.config import read_yaml, store_yaml
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
|
from auto_archiver.core.consts import SetupError
|
||||||
|
|
||||||
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
|
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
|
||||||
TEST_MODULES = "tests/data/test_modules/"
|
TEST_MODULES = "tests/data/test_modules/"
|
||||||
|
@ -224,3 +225,15 @@ def test_multiple_orchestrator(test_args):
|
||||||
output: Metadata = list(o2.feed())
|
output: Metadata = list(o2.feed())
|
||||||
assert len(output) == 1
|
assert len(output) == 1
|
||||||
assert output[0].get_url() == "https://example.com"
|
assert output[0].get_url() == "https://example.com"
|
||||||
|
|
||||||
|
|
||||||
|
def test_wrong_step_type(test_args, caplog):
|
||||||
|
args = test_args + [
|
||||||
|
"--feeders",
|
||||||
|
"example_extractor", # example_extractor is not a valid feeder!
|
||||||
|
]
|
||||||
|
|
||||||
|
orchestrator = ArchivingOrchestrator()
|
||||||
|
with pytest.raises(SetupError) as err:
|
||||||
|
orchestrator.setup(args)
|
||||||
|
assert "Module 'example_extractor' is not a feeder" in str(err.value)
|
||||||
|
|
Ładowanie…
Reference in New Issue