diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 8985e0c..9dd3e06 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -317,9 +317,11 @@ class ArchivingOrchestrator: exit() return read_yaml(config_file) - - def run(self, args: list) -> Generator[Metadata]: - + + def setup(self, args: list): + """ + Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser + """ self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) @@ -342,8 +344,10 @@ class ArchivingOrchestrator: for module_type in BaseModule.MODULE_TYPES: logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) - for result in self.feed(): - yield result + def run(self, args: list) -> Generator[Metadata]: + + self.setup(args) + return self.feed() def cleanup(self) -> None: logger.info("Cleaning up") @@ -351,7 +355,7 @@ class ArchivingOrchestrator: e.cleanup() def feed(self) -> Generator[Metadata]: - + url_count = 0 for feeder in self.feeders: for item in feeder: diff --git a/tests/extractors/test_twitter_api_extractor.py b/tests/extractors/test_twitter_api_extractor.py index d9a8eb0..004376c 100644 --- a/tests/extractors/test_twitter_api_extractor.py +++ b/tests/extractors/test_twitter_api_extractor.py @@ -23,7 +23,6 @@ class TestTwitterApiExtractor(TestExtractorBase): } @pytest.mark.parametrize("url, expected", [ - ("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"), # t.co URL ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged 
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025) @@ -32,7 +31,11 @@ class TestTwitterApiExtractor(TestExtractorBase): ]) def test_sanitize_url(self, url, expected): assert expected == self.extractor.sanitize_url(url) - + + @pytest.mark.download + def test_sanitize_url_download(self): + assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp") + @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [ ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"), ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),