From d6b4b7a932b7c8840265890583b79dc7e5038b47 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Thu, 30 Jan 2025 16:43:09 +0100
Subject: [PATCH] Further cleanup

* Removes parts of the ArchivingOrchestrator
* Removes the cli_feeder module and makes its behaviour the default,
  allowing you to pass URLs directly on the command line without the
  cumbersome --cli_feeder.urls. Just run: auto-archiver https://my.url.com
* More unit tests
* Improved error handling
---
 src/auto_archiver/__main__.py                 |   2 +-
 src/auto_archiver/core/base_module.py         | 100 +++++++++
 src/auto_archiver/core/config.py              |  33 ++-
 src/auto_archiver/core/context.py             |  10 +-
 src/auto_archiver/core/extractor.py           |   3 +-
 src/auto_archiver/core/module.py              |  54 +----
 src/auto_archiver/core/orchestrator.py        | 200 ++++++++++++++----
 .../enrichers/screenshot_enricher.py          |  40 ++++
 src/auto_archiver/feeders/csv_feeder.py       |  38 ++++
 .../modules/atlos_feeder/atlos_feeder.py      |   2 -
 .../modules/cli_feeder/__init__.py            |   1 -
 .../modules/cli_feeder/__manifest__.py        |  27 ---
 .../modules/cli_feeder/cli_feeder.py          |  15 --
 .../modules/csv_feeder/__manifest__.py        |   1 -
 .../modules/csv_feeder/csv_feeder.py          |   4 +-
 .../generic_extractor/generic_extractor.py    |   6 +-
 .../modules/html_formatter/html_formatter.py  |   4 +-
 .../screenshot_enricher.py                    |   6 +-
 .../modules/ssl_enricher/ssl_enricher.py      |   2 +-
 .../telethon_extractor/telethon_extractor.py  |   4 +-
 .../thumbnail_enricher/thumbnail_enricher.py  |   2 +-
 .../timestamping_enricher.py                  |   8 +-
 .../modules/vk_extractor/vk_extractor.py      |   4 +-
 .../modules/wacz_enricher/wacz_enricher.py    |   6 +-
 tests/__init__.py                             |   3 +-
 tests/conftest.py                             |   6 +
 tests/test_orchestrator.py                    |  27 ++-
 27 files changed, 417 insertions(+), 191 deletions(-)
 create mode 100644 src/auto_archiver/core/base_module.py
 create mode 100644 src/auto_archiver/enrichers/screenshot_enricher.py
 create mode 100644 src/auto_archiver/feeders/csv_feeder.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/__init__.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/__manifest__.py
 delete mode 100644 src/auto_archiver/modules/cli_feeder/cli_feeder.py

diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 0e2f54f..0023a59 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys
 
 def main():
-    ArchivingOrchestrator().run(sys.argv)
+    ArchivingOrchestrator().run(sys.argv[1:])
 
 if __name__ == "__main__":
     main()
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
new file mode 100644
index 0000000..a9a904f
--- /dev/null
+++ b/src/auto_archiver/core/base_module.py
@@ -0,0 +1,100 @@
+
+
+from urllib.parse import urlparse
+from typing import Mapping, Any
+from abc import ABC
+from copy import deepcopy, copy
+
+from loguru import logger
+
+class BaseModule(ABC):
+
+    """
+    Base module class. All modules should inherit from this class.
+
+    The exact methods a class implements will depend on the type of module it is,
+    however all modules have a .setup(config: dict) method to run any setup code
+    (e.g. logging in to a site, spinning up a browser etc.)
+
+    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
+    a subclass can be of multiple types. For example, a module that extracts data from
+    a website and stores it in a database would be both an 'extractor' and a 'database' module.
+
+    Each module is a python package, and should have a __manifest__.py file in the
+    same directory as the module file. The __manifest__.py specifies the module information
+    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
+    default manifest structure.
+
+    """
+
+    MODULE_TYPES = [
+        'feeder',
+        'extractor',
+        'enricher',
+        'database',
+        'storage',
+        'formatter'
+    ]
+
+    _DEFAULT_MANIFEST = {
+        'name': '',  # the display name of the module
+        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
+        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
+        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
+        'description': '',  # a description of the module
+        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
+        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
+        'version': '1.0',  # the version of the module
+        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
+    }
+
+    config: Mapping[str, Any]
+    authentication: Mapping[str, Mapping[str, str]]
+    name: str
+
+    # this is set by the orchestrator prior to archiving
+    tmp_dir: str = None
+
+    def setup(self, config: dict):
+        # this is important. Each instance is given its own deepcopied config, so modules cannot
+        # change values to affect other modules
+        config = deepcopy(config)
+        authentication = config.pop('authentication', {})
+
+        # expand concatenated sites, e.g. 'x.com,twitter.com' becomes one entry per site
+        for key, val in copy(authentication).items():
+            if "," in key:
+                for site in key.split(","):
+                    authentication[site] = val
+                del authentication[key]
+
+        self.authentication = authentication
+        self.config = config
+        for key, val in config.get(self.name, {}).items():
+            setattr(self, key, val)
+
+    def __repr__(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
+
+    def auth_for_site(self, site: str) -> dict:
+        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
+        # for now, just hard code those.
+
+        # SECURITY: parse the domain using urllib
+        site = urlparse(site).netloc
+        # add the 'www' version of the site to the list of sites to check
+        for to_try in [site, f"www.{site}"]:
+            if to_try in self.authentication:
+                return self.authentication[to_try]
+
+        # do a fuzzy string match just to print a warning - don't use it since it's insecure
+        for key in self.authentication.keys():
+            if key in site or site in key:
+                logger.warning(f"Could not find exact authentication information for site '{site}'. "
+                               f"Did find information for '{key}' which is close, is this what you meant? "
+                               f"If so, edit your authentication settings to make sure it exactly matches.")
+
+        return {}
\ No newline at end of file
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index ca8ed25..2d462e4 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -15,8 +15,8 @@ from .module import BaseModule
 from typing import Any, List, Type, Tuple
 
-yaml = YAML()
+yaml: YAML = YAML()
 
 EMPTY_CONFIG = yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined
@@ -25,6 +31,24 @@ steps:""" + "".join([f"\n  {module}s: []" for module in BaseModule.MODULE_TYPES
 """
 # Global configuration
+
+# Authentication
+# a dictionary of authentication information that can be used by extractors to log in to websites.
+# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
+# Common login 'types' are username/password, cookie, api key/token.
+# Some examples:
+# facebook.com:
+#   username: "my_username"
+#   password: "my_password"
+# or for a site that uses an API key:
+# twitter.com,x.com:
+#   api_key: "my_api_key"
+#   api_secret: "my_api_secret"
+# youtube.com:
+#   cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
+
+authentication: {}
+
 
 # These are the global configurations that are used by the modules
 
 logging:
@@ -136,12 +160,9 @@
 # TODO: make this tidier/find a way to notify of which keys should not be stored
 
-def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
+def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save = deepcopy(config)
-    for key1, key2 in do_not_store_keys:
-        if key1 in config_to_save and key2 in config_to_save[key1]:
-            del config_to_save[key1][key2]
-
+    config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
         yaml.dump(config_to_save, outf)
\ No newline at end of file
diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py
index 9a21b5c..0db5359 100644
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -53,12 +53,4 @@ class ArchivingContext:
         if full_reset:
             ac.keep_on_reset = set()
         ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-    # ---- custom getters/setters for widely used context values
-
-    @staticmethod
-    def set_tmp_dir(tmp_dir: str):
-        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
-
-    @staticmethod
-    def get_tmp_dir() -> str:
-        return ArchivingContext.get_instance().configs.get("tmp_dir")
+    # ---- custom getters/setters for widely used context values
\ No newline at end of file
diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py
index ed261eb..b0d80bc 100644
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -12,7 +12,5 @@ from dataclasses import dataclass
 import mimetypes
 import os
-import mimetypes
-
 import requests
 from loguru import logger
 from retrying import retry
@@ -71,7 +70,7 @@ class Extractor(BaseModule):
         to_filename = url.split('/')[-1].split('?')[0]
         if len(to_filename) > 64:
             to_filename = to_filename[-64:]
-        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
+        to_filename = os.path.join(self.tmp_dir, to_filename)
         if verbose: logger.debug(f"downloading 
{url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 4542b88..501f238 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -7,7 +7,6 @@ from __future__ import annotations from dataclasses import dataclass from typing import List -from abc import ABC import shutil import ast import copy @@ -17,63 +16,12 @@ import os from os.path import join, dirname from loguru import logger import auto_archiver +from .base_module import BaseModule _LAZY_LOADED_MODULES = {} MANIFEST_FILE = "__manifest__.py" -class BaseModule(ABC): - - """ - Base module class. All modules should inherit from this class. - - The exact methods a class implements will depend on the type of module it is, - however all modules have a .setup(config: dict) method to run any setup code - (e.g. logging in to a site, spinning up a browser etc.) - - See BaseModule.MODULE_TYPES for the types of modules you can create, noting that - a subclass can be of multiple types. For example, a module that extracts data from - a website and stores it in a database would be both an 'extractor' and a 'database' module. - - Each module is a python package, and should have a __manifest__.py file in the - same directory as the module file. The __manifest__.py specifies the module information - like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the - default manifest structure. - - """ - - MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' - ] - - _DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. 
This can be left blank to use the default entry point of module_name::ModuleName
-    'version': '1.0', # the version of the module
-    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
-}
-
-    config: dict
-    name: str
-
-    def setup(self, config: dict):
-        self.config = config
-        for key, val in config.get(self.name, {}).items():
-            setattr(self, key, val)
-
-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
 
 def setup_paths(paths: list[str]) -> None:
     """
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index ba46492..ad11849 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -5,12 +5,15 @@
 """
 from __future__ import annotations
 from typing import Generator, Union, List
 from urllib.parse import urlparse
 from ipaddress import ip_address
 import argparse
 import os
 import sys
+import json
+from tempfile import TemporaryDirectory
+import traceback
 
 from rich_argparse import RichHelpFormatter
 
@@ -18,17 +21,46 @@ from .context import ArchivingContext
 from .metadata import Metadata
 from ..version import __version__
-from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
-from . import validators
+from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule
-import tempfile, traceback
 
 from loguru import logger
 
 DEFAULT_CONFIG_FILE = "orchestration.yaml"
 
+class JsonParseAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        try:
+            setattr(namespace, self.dest, json.loads(values))
+        except json.JSONDecodeError as e:
+            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
+
+
+class AuthenticationJsonParseAction(JsonParseAction):
+    def __call__(self, parser, namespace, values, option_string=None):
+        try:
+            super().__call__(parser, namespace, values, option_string)
+            auth_dict = getattr(namespace, self.dest)
+        except argparse.ArgumentTypeError:
+            # not a valid JSON string, treat it as a path to a JSON/YAML file instead
+            auth_dict = values
+
+        if isinstance(auth_dict, str):
+            try:
+                with open(auth_dict, 'r') as f:
+                    try:
+                        auth_dict = json.load(f)
+                    except json.JSONDecodeError:
+                        # maybe it's yaml, try that (rewind first, json.load consumed the file)
+                        f.seek(0)
+                        auth_dict = yaml.load(f)
+            except OSError:
+                pass
+
+        if not isinstance(auth_dict, dict):
+            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        # expand concatenated site keys, e.g. "x.com,twitter.com" into one entry per site
+        for key, val in list(auth_dict.items()):
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site.strip()] = val
+                del auth_dict[key]
+        for site, auth in auth_dict.items():
+            if not isinstance(site, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        setattr(namespace, self.dest, auth_dict)
 
 class UniqueAppendAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         if not hasattr(namespace, self.dest):
@@ -38,9 +70,7 @@ class ArchivingOrchestrator:
             getattr(namespace, self.dest).append(value)
 
 class ArchivingOrchestrator:
-
-    _do_not_store_keys = []
-
+
     def setup_basic_parser(self):
         parser = argparse.ArgumentParser(
                 prog="auto-archiver",
                 epilog="Check the code at https://github.com/bellingcat/auto-archiver",
                 formatter_class=RichHelpFormatter,
         )
-        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit')
+        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
         parser.add_argument('--version', action='version', version=__version__)
-        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
+        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'orchestration.yaml\')', default=DEFAULT_CONFIG_FILE)
         parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
@@ -80,7 +110,6 @@
         # only load the modules enabled in config
         # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
         enabled_modules = []
-
         # first loads the modules from the config file, then from the command line
         for config in [yaml_config['steps'], basic_config.__dict__]:
             for module_type in BaseModule.MODULE_TYPES:
@@ -120,7 +149,7 @@
         if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
             logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
+            store_yaml(self.config, basic_config.config_file)
 
         return self.config
 
@@ -128,18 +157,29 @@
         if not parser:
             parser = self.parser
 
-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
+
+        # allow passing URLs directly on the command line
+        parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive; pass one or more URLs directly on the command line (these should not come from config.yaml)')
+
+        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
         parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
         parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
         parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
         parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
         parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
+        parser.add_argument('--authentication', dest='authentication',
+                            help='A dictionary of sites and their authentication methods (token, username etc.) '
+                                 'that extractors can use to log into a website. If passing this on the command line, '
+                                 'use a JSON string. You may also pass a path to a valid JSON/YAML file which will be parsed.',
+                            default={},
+                            action=AuthenticationJsonParseAction)
 
         # logging arguments
         parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
         parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
         parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
+
+
     def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
 
         if not modules:
@@ -147,6 +187,7 @@
         module: LazyBaseModule
         for module in modules:
+
             if not module.configs:
                 # this module has no configs, don't show anything in the help
                 # (TODO: do we want to show something about this module though, like a description?)
@@ -155,12 +196,6 @@
             group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
 
             for name, kwargs in module.configs.items():
-                # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
-                # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
-                do_not_store = kwargs.pop('do_not_store', False)
-                if do_not_store:
-                    self._do_not_store_keys.append((module.name, name))
-
                 if not kwargs.get('metavar', None):
                     # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
                     kwargs['metavar'] = name.upper()
@@ -208,8 +243,7 @@
         step_items = []
         modules_to_load = self.config['steps'][f"{module_type}s"]
 
-        assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \
-                                  in your configuration file or on the command line (using --{module_type}s)"
+        assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
 
         def check_steps_ok():
             if not len(step_items):
@@ -223,12 +257,37 @@
                 exit()
 
         for module in modules_to_load:
+            if module == 'cli_feeder':
+                urls = self.config['urls']
+                if not urls:
+                    logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
+                    self.basic_parser.print_help()
+                    exit()
+
+                # cli_feeder is a pseudo module, it just takes the command line args
+                def feed(self) -> Generator[Metadata]:
+                    for url in urls:
+                        logger.debug(f"Processing URL: '{url}'")
+                        yield Metadata().set_url(url)
+                    ArchivingContext.set("folder", "cli")
+
+                pseudo_module = type('CLIFeeder', (Feeder,), {
+                    'name': 'cli_feeder',
+                    'display_name': 'CLI Feeder',
+                    '__iter__': feed
+                })()
+
+                step_items.append(pseudo_module)
+                continue
+
             if module in invalid_modules:
                 continue
             try:
                 loaded_module: BaseModule = get_module(module, self.config)
             except (KeyboardInterrupt, Exception) as e:
-                logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
-                if module_type == 'extractor' and loaded_module.name == module:
-                    loaded_module.cleanup()
+                logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
+                # NOTE: get_module raised, so there is no loaded_module to clean up here
                 exit()
@@ -285,13 +344,18 @@
     def cleanup(self)->None:
         logger.info("Cleaning up")
-        for e in self.config['steps']['extractors']:
+        for e in self.extractors:
             e.cleanup()
 
     def feed(self) -> Generator[Metadata]:
-        for feeder in self.config['steps']['feeders']:
+
+        url_count = 0
+        for feeder in self.feeders:
             for item in feeder:
                 yield self.feed_item(item)
+                url_count += 1
+
+        logger.success(f"Processed {url_count} URL(s)")
         self.cleanup()
 
     def feed_item(self, item: Metadata) -> Metadata:
         """
        Takes one item (URL) to archive and calls self.archive, additionally:
            - catches keyboard interruptions to do a clean exit
            - catches any unexpected error, logs it, and does a clean exit
        """
+        tmp_dir: TemporaryDirectory = None
         try:
-            ArchivingContext.reset()
-            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                ArchivingContext.set_tmp_dir(tmp_dir)
-                return self.archive(item)
+            tmp_dir = TemporaryDirectory(dir="./")
+            # set tmp_dir on all modules
+            for m in self.all_modules:
+                m.tmp_dir = tmp_dir.name
+            return self.archive(item)
         except KeyboardInterrupt:
             # catches keyboard interruptions to do a clean exit
             logger.warning(f"caught interrupt on {item=}")
-            for d in self.config['steps']['databases']: d.aborted(item)
+            for d in self.databases:
+                d.aborted(item)
             self.cleanup()
             exit()
         except Exception as e:
             logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
-            for d in self.config['steps']['databases']:
-                if type(e) == AssertionError: d.failed(item, str(e))
-                else: d.failed(item, reason="unexpected error")
+            for d in self.databases:
+                if isinstance(e, AssertionError):
+                    d.failed(item, str(e))
+                else:
+                    d.failed(item, reason="unexpected error")
+        finally:
+            if tmp_dir:
+                # remove the tmp_dir from all modules
+                for m in self.all_modules:
+                    m.tmp_dir = None
+                tmp_dir.cleanup()
 
     def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -328,31 +403,38 @@
        5. Store all downloaded/generated media
        6. Call selected Formatter and store formatted if needed
        """
+
         original_url = result.get_url().strip()
 
-        self.assert_valid_url(original_url)
+        try:
+            self.assert_valid_url(original_url)
+        except AssertionError as e:
+            logger.error(f"Error archiving URL {original_url}: {e}")
+            raise
 
         # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
         url = original_url
-        for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
+        for a in self.extractors:
+            url = a.sanitize_url(url)
+
         result.set_url(url)
         if original_url != url: result.set("original_url", original_url)
 
         # 2 - notify start to DBs, propagate already archived if feature enabled in DBs
         cached_result = None
-        for d in self.config["steps"]["databases"]:
+        for d in self.databases:
             d.started(result)
             if (local_result := d.fetch(result)):
                 cached_result = (cached_result or Metadata()).merge(local_result)
         if cached_result:
             logger.debug("Found previously archived entry")
-            for d in self.config["steps"]["databases"]:
+            for d in self.databases:
                 try: d.done(cached_result, cached=True)
                 except Exception as e:
                     logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
             return cached_result
 
         # 3 - call extractors until one succeeds
-        for a in self.config["steps"]["extractors"]:
+        for a in self.extractors:
             logger.info(f"Trying extractor {a.name} for {url}")
             try:
                 result.merge(a.download(result))
@@ -361,7 +443,7 @@
                 logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
 
         # 4 - call enrichers to work with archived content
-        for e in self.config["steps"]["enrichers"]:
+        for e in self.enrichers:
             try: e.enrich(result)
             except Exception as exc:
                 logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
@@ -370,7 +452,7 @@
         result.store()
 
         # 6 - format and store formatted if needed
-        if final_media := self.config["steps"]["formatters"][0].format(result):
+        if final_media := self.formatters[0].format(result):
             final_media.store(url=url, metadata=result)
             result.set_final_media(final_media)
 
@@ -378,7 +460,7 @@
             result.status = "nothing archived"
 
         # signal completion to databases and archivers
-        for d in self.config["steps"]["databases"]:
+        for d in self.databases:
             try: d.done(result)
             except Exception as e:
                 logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@@ -403,4 +485,44 @@
         assert ip.is_global, f"Invalid IP used"
         assert not ip.is_reserved, f"Invalid IP used"
         assert not ip.is_link_local, f"Invalid IP used"
-        assert not ip.is_private, f"Invalid IP used"
\ No newline at end of file
+        assert not ip.is_private, f"Invalid IP used"
+
+
+    # Helper Properties
+
+    @property
+    def feeders(self) -> List[Feeder]:
+        return self._get_property('feeders')
+
+    @property
+    def extractors(self) -> List[Extractor]:
+        return self._get_property('extractors')
+
+    @property
+    def enrichers(self) -> List[Enricher]:
+        return self._get_property('enrichers')
+
+    @property
+    def databases(self) -> List[Database]:
+        return self._get_property('databases')
+
+    @property
+    def storages(self) -> List[Storage]:
+        return self._get_property('storages')
+
+    @property
+    def formatters(self) -> List[Formatter]:
+        return self._get_property('formatters')
+
+    @property
+    def all_modules(self) -> List[BaseModule]:
+        return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
+
+    def _get_property(self, prop):
+        try:
+            f = self.config['steps'][prop]
+            if not isinstance(f[0], (BaseModule, LazyBaseModule)):
+                raise TypeError
+            return f
+        except (KeyError, IndexError, TypeError):
+            exit("Property called prior to full initialisation")
\ No newline at end of file
diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py
new file mode 100644
index 0000000..0d05d92
--- /dev/null
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -0,0 +1,40 @@
+from loguru import logger
+import time, os
+from selenium.common.exceptions import TimeoutException
+
+
+from auto_archiver.core import Enricher
+from ..utils import Webdriver, UrlUtil, random_str
+from ..core import Media, Metadata
+
+class ScreenshotEnricher(Enricher):
+    name = "screenshot_enricher"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "width": {"default": 1280, "help": "width of the screenshots"},
+            "height": {"default": 720, "help": "height of the screenshots"},
+            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
+            "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        if UrlUtil.is_auth_wall(url):
+            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
+            return
+
+        logger.debug(f"Enriching screenshot for {url=}")
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
+            try:
+                driver.get(url)
+                time.sleep(int(self.sleep_before_screenshot))
+                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
+                driver.save_screenshot(screenshot_file)
+                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
+            except TimeoutException:
+                logger.info("TimeoutException loading page for screenshot")
+            except Exception as e:
+                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py
new file mode 100644
index 0000000..e9da518
--- /dev/null
+++ b/src/auto_archiver/feeders/csv_feeder.py
@@ -0,0 +1,38 @@
+from loguru import logger
+import csv
+
+from auto_archiver.core import Feeder
+from ..core import Metadata, ArchivingContext
+from ..utils import url_or_none
+
+class CSVFeeder(Feeder):
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "files": {
+                "default": None,
+                "help": "Path to the input file(s) to read the URLs from, comma separated. "
+                        "Input files should be formatted with one URL per line",
+                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        }
+
+    def __iter__(self) -> Metadata:
+        url_column = self.column or 0
+        for file in self.files:
+            with open(file, "r") as f:
+                reader = csv.reader(f)
+                first_row = next(reader)
+                if isinstance(url_column, str):
+                    # a column name was given, so the first row must be a header row
+                    url_column = first_row.index(url_column)
+                elif url_or_none(first_row[url_column]):
+                    # the first row is not a header row, process it as a URL too
+                    logger.debug(f"Processing {first_row[url_column]}")
+                    yield Metadata().set_url(first_row[url_column])
+                for row in reader:
+                    url = row[url_column]
+                    logger.debug(f"Processing {url}")
+                    yield Metadata().set_url(url)
+        ArchivingContext.set("folder", "cli")
\ No newline at end of file
diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
index bbf06f6..8c8f9cb 100644
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):
 
             if len(data["results"]) == 0 or cursor is None:
                 break
-
-        logger.success(f"Processed {count} URL(s)")
diff --git a/src/auto_archiver/modules/cli_feeder/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py
deleted file mode 100644
index 9c85787..0000000
--- a/src/auto_archiver/modules/cli_feeder/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .cli_feeder import CLIFeeder
\ No newline at end of file
diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py
deleted file mode 100644
index cf5c1b7..0000000
--- a/src/auto_archiver/modules/cli_feeder/__manifest__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "name": "CLI Feeder",
-    "type": ["feeder"],
-    "requires_setup": False,
-    "dependencies": {
-        "python": ["loguru"],
-    },
-    'entry_point': 'cli_feeder::CLIFeeder',
-    "configs": {
-        "urls": {
-            "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-            "nargs": "+",
-            "required": True,
-            "do_not_store": True,
-            "metavar": "INPUT URLS",
-        },
-    },
-    "description": """
-    Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
-
-    ### Features
-    - Takes a single URL or a list of URLs provided via the command line.
-    - Converts each URL into a `Metadata` object and yields it for processing.
-    - Ensures URLs are processed only if they are explicitly provided.
-
-    """
-}
diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
deleted file mode 100644
index 62cb659..0000000
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from loguru import logger
-
-from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
-
-
-class CLIFeeder(Feeder):
-
-    def __iter__(self) -> Metadata:
-        for url in self.urls:
-            logger.debug(f"Processing URL: '{url}'")
-            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py
index b062ee6..7249395 100644
--- a/src/auto_archiver/modules/csv_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py
@@ -26,7 +26,6 @@
     - Supports reading URLs from multiple input files, specified as a comma-separated list.
     - Allows specifying the column number or name to extract URLs from.
    - Skips header rows if the first value is not a valid URL.
-    - Integrates with the `ArchivingContext` to manage URL feeding.
 
    ### Setup Notes
    - Input files should be formatted with one URL per line.
diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
index ad0a035..1cd9022 100644
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
                 url = row[0]
                 logger.debug(f"Processing {url}")
                 yield Metadata().set_url(url)
-                ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
\ No newline at end of file
+                ArchivingContext.set("folder", "cli")
\ No newline at end of file
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
index e643c21..2879c05 100644
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
 
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, '%(id)s.%(ext)s'),
+                       'quiet': False, 'noplaylist': not self.allow_playlist,
+                       'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles,
+                       "live_from_start": self.live_from_start, "proxy": self.proxy,
+                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
 
         if item.netloc in ['youtube.com', 'www.youtube.com']:
             if self.cookies_from_browser:
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index bfc2efa..4da82c8 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -7,7 +7,7 @@ import json
 import base64
 
 from auto_archiver.version import __version__
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
@@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
             version=__version__
         )
 
-        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
+        html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
         with open(html_path, mode="w", encoding="utf-8") as outf:
             outf.write(content)
         final_media = Media(filename=html_path, _mimetype="text/html")
diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
index be775ce..8e7639a 100644
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException
 
 from auto_archiver.core import Enricher
 from auto_archiver.utils import Webdriver, UrlUtil, random_str
-from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): @@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher): try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png") + screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") if self.save_to_pdf: - pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf") + pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf") pdf = driver.print_page(driver.print_options) with open(pdf_file, "wb") as f: f.write(base64.b64decode(pdf)) diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 52237ee..76784fa 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -23,6 +23,6 @@ class SSLEnricher(Enricher): logger.debug(f"fetching SSL certificate for {domain=} in {url=}") cert = ssl.get_server_certificate((domain, 443)) - cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem") + cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem") with open(cert_fn, "w") as f: f.write(cert) to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 8a08954..3e952e8 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -9,7 +9,7 @@ from tqdm import tqdm import re, time, json, os from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str @@ -120,7 +120,7 @@ class TelethonArchiver(Extractor): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index b27243b..429ba38 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher): logger.debug(f"generating thumbnails for {to_enrich.get_url()}") for m_id, m in enumerate(to_enrich.media[::]): if m.is_video(): - folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24)) + folder = os.path.join(self.tmp_dir, random_str(24)) os.makedirs(folder, exist_ok=True) logger.debug(f"generating thumbnails for {m.filename}") duration = m.get("duration") diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index a7a0aee..078c1ba 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ 
b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -9,9 +9,7 @@ from asn1crypto import pem
 import certifi
 
 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
-from auto_archiver.core import Extractor
-
+from auto_archiver.core import Metadata, Media
 
 class TimestampingEnricher(Enricher):
     """
@@ -33,7 +31,7 @@
             logger.warning(f"No hashes found in {url=}")
             return
 
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         hashes_fn = os.path.join(tmp_dir, "hashes.txt")
 
         data_to_sign = "\n".join(hashes)
@@ -93,7 +91,7 @@
         cert_chain = []
         for cert in path:
-            cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
+            cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
             with open(cert_fn, "wb") as f:
                 f.write(cert.dump())
             cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 301fa89..2d09138 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
 
 from auto_archiver.utils.misc import dump_payload
 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 
 
 class VkExtractor(Extractor):
@@ -35,7 +35,7 @@
 
         result.set_content(dump_payload(vk_scrapes))
 
-        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
+        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
         for filename in filenames:
             result.add_media(Media(filename))
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 8810b84..3f67b7c 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -5,7 +5,7 @@ from zipfile import ZipFile
 from loguru import logger
 from warcio.archiveiterator import ArchiveIterator
 
-from auto_archiver.core import Media, Metadata, ArchivingContext
+from auto_archiver.core import Media, Metadata
 from auto_archiver.core import Extractor, Enricher
 from auto_archiver.utils import UrlUtil, random_str
 
@@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
         url = to_enrich.get_url()
 
         collection = random_str(8)
-        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
         browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
 
         cmd = [
@@ -154,7 +154,7 @@
         logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
 
         # unzipping the .wacz
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         unzipped_dir = os.path.join(tmp_dir, "unzipped")
         with ZipFile(wacz_filename, 'r') as z_obj:
             z_obj.extractall(path=unzipped_dir)
diff --git a/tests/__init__.py b/tests/__init__.py
index 3d66aff..31f38cb 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -2,5 +2,3 @@
-import tempfile
 from auto_archiver.core.context import ArchivingContext
 
-ArchivingContext.reset(full_reset=True)
-ArchivingContext.set_tmp_dir(tempfile.gettempdir())
\ No newline at end of file
+ArchivingContext.reset(full_reset=True)
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index af0fd6d..3bd382b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,7 @@
 pytest conftest file, for shared fixtures and configuration
 """
 
+from tempfile import TemporaryDirectory
 from typing import Dict, Tuple
 import hashlib
 import pytest
@@ -25,8 +26,13 @@ def setup_module(request):
 
         m = get_module(module_name, {module_name: config})
 
+        # add the tmp_dir to the module
+        tmp_dir = TemporaryDirectory()
+        m.tmp_dir = tmp_dir.name
+
         def cleanup():
             _LAZY_LOADED_MODULES.pop(module_name)
+            tmp_dir.cleanup()
 
         request.addfinalizer(cleanup)
         return m
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 03cb521..68417aa 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1,6 +1,6 @@
 import pytest
 import sys
-from argparse import ArgumentParser
+from argparse import ArgumentParser, ArgumentTypeError
 from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
@@ -113,16 +113,23 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
 
     # run the orchestrator
     orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
+    # should run OK, since there are no missing required fields
+    assert orchestrator.config is not None
 
-    # should run OK, since there are no missing required fields
+def test_load_authentication_string(orchestrator, test_args):
+    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
+    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
 
-    # basic_args = basic_parser.parse_known_args(test_args)
-    # test_yaml = read_yaml(TEST_ORCHESTRATION)
-    # test_yaml['example_module'] = {'required_field': 'some_value'}
+def test_load_authentication_string_concat_site(orchestrator, test_args):
+    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
+    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
+                                                     "twitter.com": {"api_key": "my_key"}}
 
-    # # monkey patch the example_module to have a 'configs' setting of 'my_var' with required=True
-    # # load the module first
-    # m = get_module_lazy("example_module")
+def test_load_invalid_authentication_string(orchestrator, test_args):
+    with pytest.raises(ArgumentTypeError):
+        orchestrator.run(test_args + ["--authentication", "{invalid_json"])
 
-    # orchestrator.setup_complete_parser(basic_args, test_yaml, unused_args=[])
-    # assert orchestrator.config is not None
+def test_load_authentication_invalid_dict(orchestrator, test_args):
+    with pytest.raises(ArgumentTypeError):
+        orchestrator.run(test_args + ["--authentication", "[true, false]"])
\ No newline at end of file
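
--
Usage sketch (illustrative only; the example.com URLs and credential values are
placeholders, the flags come from the argparse setup in this patch):

    # URLs can now be passed straight on the command line, no --cli_feeder.urls needed
    auto-archiver https://example.com/some/post
    auto-archiver --config orchestration.yaml https://example.com/some/post

    # authentication can be passed as a JSON string, or a path to a JSON/YAML file
    auto-archiver https://example.com/some/post --authentication '{"x.com,twitter.com": {"api_key": "my_key"}}'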