kopia lustrzana https://github.com/bellingcat/auto-archiver
makes orchestrator.run return the results to allow for code integration (#196)
rodzic
5614af3f63
commit
9297697ef5
|
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "0.13.1"
|
||||
version = "0.13.2"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
|
|
|
@ -30,6 +30,7 @@ from loguru import logger
|
|||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
|
||||
class JsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
try:
|
||||
|
@ -60,6 +61,8 @@ class AuthenticationJsonParseAction(JsonParseAction):
|
|||
if not isinstance(site, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
if not hasattr(namespace, self.dest):
|
||||
|
@ -68,6 +71,7 @@ class UniqueAppendAction(argparse.Action):
|
|||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
feeders: List[Type[Feeder]]
|
||||
|
@ -166,7 +170,6 @@ class ArchivingOrchestrator:
|
|||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
|
||||
# allow passing URLs directly on the command line
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
|
||||
|
@ -180,7 +183,7 @@ class ArchivingOrchestrator:
|
|||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
action=AuthenticationJsonParseAction)
|
||||
# logging arguments
|
||||
|
@ -188,7 +191,6 @@ class ArchivingOrchestrator:
|
|||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
if not modules:
|
||||
|
@ -273,6 +275,7 @@ class ArchivingOrchestrator:
|
|||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
exit()
|
||||
# cli_feeder is a pseudo module, it just takes the command line args
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
|
@ -285,7 +288,6 @@ class ArchivingOrchestrator:
|
|||
|
||||
})()
|
||||
|
||||
|
||||
pseudo_module.__iter__ = feed
|
||||
step_items.append(pseudo_module)
|
||||
continue
|
||||
|
@ -316,7 +318,7 @@ class ArchivingOrchestrator:
|
|||
|
||||
return read_yaml(config_file)
|
||||
|
||||
def run(self, args: list) -> None:
|
||||
def run(self, args: list) -> Generator[Metadata]:
|
||||
|
||||
self.setup_basic_parser()
|
||||
|
||||
|
@ -340,8 +342,8 @@ class ArchivingOrchestrator:
|
|||
for module_type in BaseModule.MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
for _ in self.feed():
|
||||
pass
|
||||
for result in self.feed():
|
||||
yield result
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info("Cleaning up")
|
||||
|
@ -393,7 +395,6 @@ class ArchivingOrchestrator:
|
|||
m.tmp_dir = None
|
||||
tmp_dir.cleanup()
|
||||
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
|
@ -489,7 +490,6 @@ class ArchivingOrchestrator:
|
|||
assert not ip.is_link_local, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@property
|
||||
|
|
Ładowanie…
Reference in New Issue