From 3c543a3a6a1d49fdc01337593756006c6515d9aa Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Tue, 18 Feb 2025 19:10:09 +0000
Subject: [PATCH] Various fixes for issues with new architecture (#208)

* Add formatters to the TOC - fixes #204
* Add 'steps' settings to the example YAML in the docs. Fixes #206
* Improved docs on authentication architecture
* Fix setting modules on the command line - they now override any module settings in the orchestration as opposed to appending
* Fix tests for gsheet-feeder: add a test service_account.json (note: not real keys in there)
* Rename the command line entrypoint to _command_line_run

  Also: make it clear that code implementations should not call this.
  Make sure the command line entry returns (we don't want a generator)

* Fix unit tests to use new code entry points
* Version bump
* Move iterating of generator up to __main__
* Breakpoint
* Two minor fixes
* Fix unit tests + add new '__main__' entry point implementation test
* Skip youtube tests if running on CI. Should still run them locally
* Fix full implementation run on GH actions
* Fix skipif test for GH Actions CI
* Add skipifs for Truth Social - it blocks GH actions

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
---
 docs/scripts/scripts.py                       |  35 ++++-
 docs/source/conf.py                           |   3 +
 docs/source/core_modules.md                   |   1 +
 docs/source/how_to.md                         |   7 +
 docs/source/how_to/authentication.md          |  57 +++++++
 pyproject.toml                                |   2 +-
 src/auto_archiver/__main__.py                 |   2 +-
 src/auto_archiver/core/base_module.py         |  29 ++--
 src/auto_archiver/core/config.py              |  11 +-
 src/auto_archiver/core/orchestrator.py        | 141 ++++++++++++------
 .../modules/gsheet_feeder/__manifest__.py     |   2 +-
 .../templates/html_template.html              |   2 +-
 .../screenshot_enricher/__manifest__.py       |   1 -
 tests/data/test_service_account.json          |  14 ++
 tests/extractors/test_generic_extractor.py    |   9 +-
 .../extractors/test_twitter_api_extractor.py  |   2 +-
 tests/test_implementation.py                  |  12 ++
 tests/test_orchestrator.py                    |  68 +++++++-
 18 files changed, 314 insertions(+), 84 deletions(-)
 create mode 100644 docs/source/how_to/authentication.md
 create mode 100644 tests/data/test_service_account.json

diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py
index f73315b..9712439 100644
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@@ -19,6 +19,19 @@ type_color = {
 
 TABLE_HEADER = ("Option", "Description", "Default", "Type")
 
+EXAMPLE_YAML = """
+# steps configuration
+steps:
+...
+{steps_str}
+...
+
+# module configuration
+...
+
+{config_string}
+"""
+
 def generate_module_docs():
     yaml = YAML()
     SAVE_FOLDER.mkdir(exist_ok=True)
@@ -45,11 +58,14 @@ def generate_module_docs():
 ```
 {description}
 """
+        steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
+
         if not manifest['configs']:
-            readme_str += "\n*This module has no configuration options.*\n"
+            config_string = f"# No configuration options for {module.name}.*\n"
         else:
-            config_yaml = {}
 
+            config_table = header_row
+            config_yaml = {}
             for key, value in manifest['configs'].items():
                 type = value.get('type', 'string')
                 if type == 'auto_archiver.utils.json_loader':
@@ -65,11 +81,14 @@ def generate_module_docs():
 
                 configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
         readme_str += "\n## Configuration Options\n"
         readme_str += "\n### YAML\n"
-        yaml_string = io.BytesIO()
-        yaml.dump({module.name: config_yaml}, yaml_string)
-
-        readme_str += f"```{{code}} yaml\n{yaml_string.getvalue().decode('utf-8')}\n```\n"
+        config_string = io.BytesIO()
+        yaml.dump({module.name: config_yaml}, config_string)
+        config_string = config_string.getvalue().decode('utf-8')
+        yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
+        readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
+
+
         if manifest['configs']:
             readme_str += "\n### Command Line:\n"
             readme_str += config_table
@@ -103,3 +122,7 @@ def generate_index(modules_by_type):
     with open(SAVE_FOLDER / "module_list.md", "w") as f:
         print("writing", SAVE_FOLDER / "module_list.md")
         f.write(readme_str)
+
+
+if __name__ == "__main__":
+    generate_module_docs()
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e7093c4..5b1ad9b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,3 +77,6 @@ html_theme = 'sphinx_book_theme'
 html_static_path = ["../_static"]
 html_css_files = ["custom.css"]
 
+copybutton_prompt_text = r">>> |\.\.\."
+copybutton_prompt_is_regexp = True
+copybutton_only_copy_prompt_lines = False
\ No newline at end of file
diff --git a/docs/source/core_modules.md b/docs/source/core_modules.md
index 4ee3bfc..3a8e5ec 100644
--- a/docs/source/core_modules.md
+++ b/docs/source/core_modules.md
@@ -24,4 +24,5 @@ modules/extractor
 modules/enricher
 modules/storage
 modules/database
+modules/formatter
 ```
\ No newline at end of file
diff --git a/docs/source/how_to.md b/docs/source/how_to.md
index bf3b9fc..25e1e1d 100644
--- a/docs/source/how_to.md
+++ b/docs/source/how_to.md
@@ -45,3 +45,10 @@ The "archive location" link contains the path of the archived file, in local sto
 ![The archive result for a link in the demo sheet.](../demo-archive.png)
 
 ---
+
+```{toctree}
+:maxdepth: 1
+:glob:
+
+how_to/*
+```
\ No newline at end of file
diff --git a/docs/source/how_to/authentication.md b/docs/source/how_to/authentication.md
new file mode 100644
index 0000000..5f3bc48
--- /dev/null
+++ b/docs/source/how_to/authentication.md
@@ -0,0 +1,57 @@
+# Authentication
+
+The Authentication framework for auto-archiver allows you to add login details for various websites in a flexible way, directly from the configuration file.
+
+There are two main use cases for authentication:
+* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
+* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
+
+## The Authentication Config
+
+You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
+
+```{code} yaml
+authentication:
+   # optional file to load authentication information from, for security or multi-system deploy purposes
+   load_from_file: path/to/authentication/file.txt
+   # optional setting to load cookies from the named browser on the system.
+   cookies_from_browser: firefox
+   # optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
+   cookies_file: path/to/cookies.jar
+
+   twitter.com,x.com:
+      username: myusername
+      password: 123
+
+   facebook.com:
+      cookie: single_cookie
+
+   othersite.com:
+      api_key: 123
+      api_secret: 1234
+
+# All available options:
+   # - username: str - the username to use for login
+   # - password: str - the password to use for login
+   # - api_key: str - the API key to use for login
+   # - api_secret: str - the API secret to use for login
+   # - cookie: str - a cookie string to use for login (specific to this site)
+```
+
+### Recommendations for authentication
+
+1. **Store authentication information separately:**
+The authentication part of your configuration contains sensitive information. You should make efforts not to share this with others. For extra security, use the `load_from_file` option to keep your authentication settings out of your configuration file, ideally in a different folder.
+
+2. **Don't use your own personal credentials**
+Depending on the website you are extracting information from, there may be rules (Terms of Service) that prohibit you from scraping or extracting information using a bot. If you use your own personal account, there's a possibility it might get blocked/disabled. It's recommended to set up a separate, 'throwaway' account. In that way, if it gets blocked you can easily create another one to continue your archiving.
+
+
+### How to create a cookies.jar or pass cookies directly to auto-archiver
+
+auto-archiver uses yt-dlp's powerful cookies features under the hood. For instructions on how to extract a cookies.jar (or cookies.txt) file directly from your browser, see the FAQ in the [yt-dlp documentation](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp)
+
+```{note} For developers:
+
+For information on how to access and use authentication settings from within your module, see the `{generic_extractor}` for an example, or view the [`auth_for_site()` function in BaseModule](../autoapi/core/base_module/index.rst)
+```
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index cd76b59..9823833 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.2"
+version = "0.13.3"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 requires-python = ">=3.10,<3.13"
diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 0023a59..f901d21 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
 import sys
 
 def main():
-    ArchivingOrchestrator().run(sys.argv[1:])
+    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
 
 if __name__ == "__main__":
     main()
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index ece4719..dfdd5ad 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -63,12 +63,6 @@ class BaseModule(ABC):
     def config_setup(self, config: dict):
 
         authentication = config.get('authentication', {})
-        # extract out concatenated sites
-        for key, val in copy(authentication).items():
-            if "," in key:
-                for site in key.split(","):
-                    authentication[site] = val
-                del authentication[key]
 
         # this is important. Each instance is given its own deepcopied config, so modules cannot
         # change values to affect other modules
@@ -89,16 +83,21 @@ class BaseModule(ABC):
         Returns the authentication information for a given site. This is used to authenticate
         with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
 
-        extract_cookies: bool - whether or not to extract cookies from the given browser and return the
-        cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
+        :param site: the domain of the site to get authentication information for
+        :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        Currently, the dict can have keys of the following types:
-        - username: str - the username to use for login
-        - password: str - the password to use for login
-        - api_key: str - the API key to use for login
-        - api_secret: str - the API secret to use for login
-        - cookie: str - a cookie string to use for login (specific to this site)
-        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
+        :returns: a dict of login information for the given site
+
+        **Global options:**\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox') - uses yt-dlp under the hood to extract them\n
+        * cookies_file: str - the path to a cookies file to use for login\n
+
+        **Currently, the sites dict can have keys of the following types:**\n
+        * username: str - the username to use for login\n
+        * password: str - the password to use for login\n
+        * api_key: str - the API key to use for login\n
+        * api_secret: str - the API secret to use for login\n
+        * cookie: str - a cookie string to use for login (specific to this site)\n
         """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index c2d38ee..322ef6e 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -129,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
                 yaml_subdict[key] = value
                 continue
 
+            if key == 'steps':
+                for module_type, modules in value.items():
+                    # overwrite the 'steps' from the config file with the ones from the CLI
+                    yaml_subdict[key][module_type] = modules
+
             if is_dict_type(value):
                 update_dict(value, yaml_subdict[key])
             elif is_list_type(value):
@@ -137,7 +142,6 @@
             yaml_subdict[key] = value
 
     update_dict(from_dot_notation(dotdict), yaml_dict)
-
     return yaml_dict
 
 def read_yaml(yaml_filename: str) -> CommentedMap:
@@ -159,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
 
 def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save = deepcopy(config)
 
+    auth_dict = config_to_save.get("authentication", {})
+    if auth_dict and auth_dict.get('load_from_file'):
+        # remove all other values from the config, don't want to store it in the config file
+        auth_dict = {"load_from_file": auth_dict["load_from_file"]}
+
     config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
         _yaml.dump(config_to_save, outf)
\ No newline at end of file
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 9dd3e06..208512a 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -8,6 +8,7 @@ from __future__ import annotations
 from typing import Generator, Union, List, Type
 from urllib.parse import urlparse
 from ipaddress import ip_address
+from copy import copy
 import argparse
 import os
 import sys
@@ -43,30 +44,50 @@ class AuthenticationJsonParseAction(JsonParseAction):
     def __call__(self, parser, namespace, values, option_string=None):
         super().__call__(parser, namespace, values, option_string)
         auth_dict = getattr(namespace, self.dest)
-        if isinstance(auth_dict, str):
-            # if it's a string
+
+        def load_from_file(path):
             try:
-                with open(auth_dict, 'r') as f:
+                with open(path, 'r') as f:
                     try:
                         auth_dict = json.load(f)
                     except json.JSONDecodeError:
+                        f.seek(0)
                         # maybe it's yaml, try that
                         auth_dict = _yaml.load(f)
+                if auth_dict.get('authentication'):
+                    auth_dict = auth_dict['authentication']
+                auth_dict['load_from_file'] = path
+                return auth_dict
             except:
-                pass
+                return None
 
+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+
         if not isinstance(auth_dict, dict):
             raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
-        for site, auth in auth_dict.items():
-            if not isinstance(site, str) or not isinstance(auth, dict):
-                raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
+
+        # extract out concatenated sites
+        for key, val in copy(auth_dict).items():
+            if "," in key:
+                for site in key.split(","):
+                    auth_dict[site] = val
+                del auth_dict[key]
+
         setattr(namespace, self.dest, auth_dict)
 
 class UniqueAppendAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        if not hasattr(namespace, self.dest):
-            setattr(namespace, self.dest, [])
         for value in values:
             if value not in getattr(namespace, self.dest):
                 getattr(namespace, self.dest).append(value)
@@ -104,36 +125,50 @@ class ArchivingOrchestrator:
         return parser
 
     def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
+
+
+        # modules parser to get the overridden 'steps' values
+        modules_parser = argparse.ArgumentParser(
+            add_help=False,
+        )
+        self.add_modules_args(modules_parser)
+        cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
+        for module_type in BaseModule.MODULE_TYPES:
+            yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
+
         parser = DefaultValidatingParser(
             add_help=False,
         )
         self.add_additional_args(parser)
+        # merge command line module args (--feeders, --enrichers etc.) and add them to the config
+
         # check what mode we're in
         # if we have a config file, use that to decide which modules to load
         # if simple, we'll load just the modules that has requires_setup = False
         # if full, we'll load all modules
         # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
         # but should we add them? Or should we just add them to the 'complete' parser?
+
         if yaml_config != EMPTY_CONFIG:
             # only load the modules enabled in config
             # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
            enabled_modules = []
            # first loads the modules from the config file, then from the command line
-           for config in [yaml_config['steps'], basic_config.__dict__]:
-               for module_type in BaseModule.MODULE_TYPES:
-                   enabled_modules.extend(config.get(f"{module_type}s", []))
+           for module_type in BaseModule.MODULE_TYPES:
+               enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
 
             # clear out duplicates, but keep the order
             enabled_modules = list(dict.fromkeys(enabled_modules))
 
             avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
-            self.add_module_args(avail_modules, parser)
+            self.add_individual_module_args(avail_modules, parser)
 
         elif basic_config.mode == 'simple':
             simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
-            self.add_module_args(simple_modules, parser)
+            self.add_individual_module_args(simple_modules, parser)
 
             # for simple mode, we use the cli_feeder and any modules that don't require setup
-            yaml_config['steps']['feeders'] = ['cli_feeder']
+            if not yaml_config['steps']['feeders']:
+                yaml_config['steps']['feeders'] = ['cli_feeder']
 
             # add them to the config
             for module in simple_modules:
@@ -141,30 +176,38 @@ class ArchivingOrchestrator:
                 yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
         else:
             # load all modules, they're not using the 'simple' mode
-            self.add_module_args(available_modules(with_manifest=True), parser)
-
+            self.add_individual_module_args(available_modules(with_manifest=True), parser)
+
         parser.set_defaults(**to_dot_notation(yaml_config))
 
         # reload the parser with the new arguments, now that we have them
         parsed, unknown = parser.parse_known_args(unused_args)
 
-        # merge the new config with the old one
-        self.config = merge_dicts(vars(parsed), yaml_config)
+        config = merge_dicts(vars(parsed), yaml_config)
+
         # clean out args from the base_parser that we don't want in the config
         for key in vars(basic_config):
-            self.config.pop(key, None)
+            config.pop(key, None)
 
         # setup the logging
-        self.setup_logging()
+        self.setup_logging(config)
 
         if unknown:
             logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
 
-        if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
+        if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
             logger.info(f"Storing configuration file to {basic_config.config_file}")
-            store_yaml(self.config, basic_config.config_file)
+            store_yaml(config, basic_config.config_file)
 
-        return self.config
+        return config
+
+    def add_modules_args(self, parser: argparse.ArgumentParser = None):
+        if not parser:
+            parser = self.parser
+
+        # Module loading from the command line
+        for module_type in BaseModule.MODULE_TYPES:
+            parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
 
     def add_additional_args(self, parser: argparse.ArgumentParser = None):
         if not parser:
@@ -173,30 +216,24 @@ class ArchivingOrchestrator:
         # allow passing URLs directly on the command line
         parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
 
-        parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
-        parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
-        parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
-        parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
-        parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
-        parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
-
         parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
                             (token, username etc.) that extractors can use to log into \
                             a website. If passing this on the command line, use a JSON string. \
                             You may also pass a path to a valid JSON/YAML file which will be parsed.',
                             default={},
+                            nargs="?",
                             action=AuthenticationJsonParseAction)
+
         # logging arguments
         parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
         parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
         parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
 
-    def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+    def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
 
         if not modules:
             modules = available_modules(with_manifest=True)
-
-        module: LazyBaseModule
+
         for module in modules:
 
             if not module.configs:
@@ -226,18 +263,19 @@ class ArchivingOrchestrator:
             arg.should_store = should_store
 
     def show_help(self, basic_config: dict):
-        # for the help message, we want to load *all* possible modules and show the help
+        # for the help message, we want to load manifests from *all* possible modules and show their help/settings
         # add configs as arg parser arguments
+        self.add_modules_args(self.basic_parser)
         self.add_additional_args(self.basic_parser)
-        self.add_module_args(parser=self.basic_parser)
+        self.add_individual_module_args(parser=self.basic_parser)
 
         self.basic_parser.print_help()
         self.basic_parser.exit()
 
-    def setup_logging(self):
+    def setup_logging(self, config):
         # setup loguru logging
         logger.remove(0) # remove the default logger
-        logging_config = self.config['logging']
+        logging_config = config['logging']
         logger.add(sys.stderr, level=logging_config['level'])
         if log_file := logging_config['file']:
             logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
@@ -318,9 +356,9 @@ class ArchivingOrchestrator:
 
         return read_yaml(config_file)
 
-    def setup(self, args: list):
+    def setup_config(self, args: list) -> dict:
         """
-        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        Sets up the configuration file, merging the default config with the user's config
         """
         self.setup_basic_parser()
@@ -333,9 +371,16 @@ class ArchivingOrchestrator:
         # if help flag was called, then show the help
         if basic_config.help:
             self.show_help(basic_config)
-
+        # merge command line --feeder etc. args with what's in the yaml config
         yaml_config = self.load_config(basic_config.config_file)
-        self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+
+    def setup(self, args: list):
+        """
+        Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
+        """
+        self.config = self.setup_config(args)
 
         logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
         self.install_modules(self.config['steps'])
@@ -344,8 +389,18 @@ class ArchivingOrchestrator:
         for module_type in BaseModule.MODULE_TYPES:
             logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
 
-    def run(self, args: list) -> Generator[Metadata]:
+    def _command_line_run(self, args: list) -> Generator[Metadata]:
+        """
+        This is the main entry point for the orchestrator when run from the command line.
+        :param args: list of arguments to pass to the orchestrator - these are the command line args
+
+        You should not call this method from code implementations.
+
+        This method sets up the configuration, loads the modules, and runs the feed.
+        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
+        To test configurations without loading any modules, you can also first call 'setup_config'
+        """
         self.setup(args)
         return self.feed()
diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
index 7b74072..77026ea 100644
--- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
@@ -10,7 +10,7 @@
         "sheet": {"default": None, "help": "name of the sheet to archive"},
         "sheet_id": {
             "default": None,
-            "help": "(alternative to sheet name) the id of the sheet to archive",
+            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
         },
         "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
         "service_account": {
diff --git a/src/auto_archiver/modules/html_formatter/templates/html_template.html b/src/auto_archiver/modules/html_formatter/templates/html_template.html
index 8bdf5ef..62d6b0b 100644
--- a/src/auto_archiver/modules/html_formatter/templates/html_template.html
+++ b/src/auto_archiver/modules/html_formatter/templates/html_template.html
@@ -200,7 +200,7 @@
         el.innerHTML = decodeCertificate(certificate);
 
         let cyberChefUrl =
-          `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
+          `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
         // create a new anchor with this url and append after the code
         let a = document.createElement("a");
         a.href = cyberChefUrl;
diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
index 831959e..9829844 100644
--- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py
@@ -4,7 +4,6 @@
     "requires_setup": True,
     "dependencies": {
         "python": ["loguru", "selenium"],
-        "bin": ["geckodriver"]
     },
     "configs": {
         "width": {"default": 1280, "help": "width of the screenshots"},
diff --git a/tests/data/test_service_account.json b/tests/data/test_service_account.json
new file mode 100644
index 0000000..5aae894
--- /dev/null
+++ 
b/tests/data/test_service_account.json @@ -0,0 +1,14 @@ +{ + "type": "service_account", + "project_id": "some-project-id", + "private_key_id": "some-private-key-id", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDPlcaFJgt7HzoC\n4z0b18PzI2R5c892mLnNwRO8DOKid5INt6z5RAWKDPdnIyHjRBx74qNZl6768pia\nztQNgnud7mKcmvOvGrpUbFx2BdAw8xTyAlRVMalOBhUS9RKvjP5WgSwR5EKwfvzy\nrGioC6ml/segz5EchSaIzgASwB17ir0w6IrymBxUeNelfzCGJpCRhqG5nG+eEjct\nUYU0QIyihRD1Lq0f3Z3D0xfTLLZ630iFBj/Wr0BCJHkl6hdVuGhnyn4S98sMX1Bd\ntaJF/lWi4jdt7SoXD3+FWv66kHPpFfINMpReuB9u0ogfYkORgiRBOMhYBkGGQjUG\nOnBTxEc3AgMBAAECgf9bKiK8DdSz0ALzQbRLhgj2B9485jHI49wjgINOyceZ23uS\nQYXaO+DFLcgLqBkVSGanuHMpU0+qCpeM0v9yXSTIW8RguWMnFd8ID/yLRktxfQa1\n1FAQh+NlF4/gnuUoM8N/FYSy6R5grfaxwU8Qfg66IQXUB52OezSVu5lxNO4G5Rwv\nJ2e/+XYBUv/H26BnQSmjFCzbJkdbtrOeThpaLwLexKcollvoHKGyus0jpWg4C9Ez\n9EJaE+on4nd+cM1Vd+dWaHXoZ9Db9IvxPBqFJE8fynap7RDBeZK678OuCvQntrp4\nrTsE9hW8073Jhl/LbhfbDC0lhFR0JUHygVGE01ECgYEA+g+ddpGGY90yhhM76bTr\nkU6WwislMmfS0WDdLPemNgzLwCtkC2vsQgzg/egxqkVF5dJ9upiFhVgpYxY7ap9U\nSGFemb6T1ASl/1yeNhd0yc4PZFsJ29k+kNgSIlJYm9KDCIMqS1wPoXvFQhbMitOf\n/gLCPugxl67c+qg6nfuODTkCgYEA1IPngESOJnV8oa2WReWrO6+u6xb/OhqdmBzI\n5yq1z3f5gb98XESZR/rCH2vAOmHIJPn3XdZHsznOuxhZwGr1oztiRIurLmBlxQoL\n7tq0jDOUVSD2yeyQwKt5LaBH94P598FiauGxXM4raREWKtcNBGoOX1u1+kEBsoL4\ntf10Z+8CgYEA3QFkB+ECR8y91KW3NAzEjj5JG/8J9wyv1IGpuQ5/hhG1Gni/CSEv\nRAkh6QaIrpZe+ooYuQwIJhwPKBYEGW4MDZSRCYzYFnCtTY5L/j6o55sJG4cipX3R\nwC5XiKIC0mUxjhpvDP+miPBdHNYNnT0AkH1btEF/YzIW+Coq9GnZ2HECgYAOOpax\ne+WYpZ0mphy9qVcBtA2eJ/gGx+ltWeAJuk5aCcpm6Y9GDkHFFAETYX+JaSqhbysk\n2UgLs/8nf8XioEa6GyvFMyTPAh1OSBHseDBGgt2XpZFgi7pVbCW87FJlPCzsbcJN\nLbdWY2d8rWwyihuRBBjaQaW5j8ixTxuf88xreQKBgQCST4Fr8C5CkpakTA+KOost\nLOlziUBm0534mTg7dTcOE1H1+gxtqpXlXcJylpGz1lUXRlHCIutN5iPJcN5cxFES\nsP7wBd7BhficsMKDiWPm9XbP2zXVZu0ldUxA1mONMsS1P4p7i3Dh4uzrRDmSkTUL\njUpppYDumg3oM7wSJ6sTQA==\n-----END PRIVATE KEY-----", + "client_email": "some-email", + "client_id": "some-client-email", + "auth_uri": "https://example.com/o/oauth2/auth", + "token_uri": "https://oauth2.example.com/token", + "auth_provider_x509_cert_url": "https://www.example.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.example.com/robot/v1/metadata/x509/some-email", + "universe_domain": "example.com" + } + \ No newline at end of file diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index c70a51f..54f4d9c 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -9,6 +9,7 @@ import pytest from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor from .test_extractor_base import TestExtractorBase +CI=os.getenv("GITHUB_ACTIONS", '') == 'true' class TestGenericExtractor(TestExtractorBase): """Tests Generic Extractor """ @@ -77,10 +78,11 @@ class TestGenericExtractor(TestExtractorBase): result = self.extractor.download(item) assert not result - + @pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. 
Youtube (yt-dlp) doesn't support logging in with username/password.")
     @pytest.mark.download
     def test_youtube_download(self, make_item):
         # url https://www.youtube.com/watch?v=5qap5aO4i9A
+
         item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
 
         result = self.extractor.download(item)
         assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
@@ -114,6 +116,7 @@ class TestGenericExtractor(TestExtractorBase):
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_video(self, make_item):
         item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
@@ -121,18 +124,21 @@ class TestGenericExtractor(TestExtractorBase):
         assert len(result.media) == 1
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_no_media(self, make_item):
         item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_poll(self, make_item):
         item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
         result = self.extractor.download(item)
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_single_image(self, make_item):
         item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
@@ -140,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
         assert len(result.media) == 1
         assert result is not False
 
+    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
     @pytest.mark.download
     def test_truthsocial_download_multiple_images(self, make_item):
         item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
diff --git a/tests/extractors/test_twitter_api_extractor.py b/tests/extractors/test_twitter_api_extractor.py
index 004376c..26394ac 100644
--- a/tests/extractors/test_twitter_api_extractor.py
+++ b/tests/extractors/test_twitter_api_extractor.py
@@ -34,7 +34,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
 
     @pytest.mark.download
     def test_sanitize_url_download(self):
-        assert "https://t.co/yl3oOJatFp" == self.extractor.sanitize_url("https://www.bellingcat.com/category/resources/")
+        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
 
     @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
         ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
diff --git a/tests/test_implementation.py b/tests/test_implementation.py
index 7e33651..85fc448 100644
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@@ -60,3 +60,15 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
 
     # should treat an empty file as if there is no file at all
     assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
+
+def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
+    from auto_archiver.__main__ import main
+
+    # monkey patch to change the current working directory, so that we don't use the user's real config file
+    monkeypatch.chdir(tmp_path)
+    with monkeypatch.context() as m:
+        m.setattr(sys, "argv", ["auto-archiver"])
+        with pytest.raises(SystemExit):
+            main()
+
+    assert "No URLs provided. Please provide at least one" in caplog.text
\ No newline at end of file
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 5ba57d0..f93f8b8 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -75,18 +75,36 @@ def test_help(orchestrator, basic_parser, capsys):
         orchestrator.show_help(args)
 
     assert exit_error.value.code == 0
-    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out
+
+    logs = capsys.readouterr().out
+    assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in logs
+
+    # basic config options
+    assert "--version" in logs
+
+    # setting modules options
+    assert "--feeders" in logs
+    assert "--extractors" in logs
+
+    # authentication options
+    assert "--authentication" in logs
+
+    # logging options
+    assert "--logging.level" in logs
+
+    # individual module configs
+    assert "--gsheet_feeder.sheet_id" in logs
 
 def test_add_custom_modules_path(orchestrator, test_args):
-    orchestrator.run(test_args)
+    orchestrator.setup_config(test_args)
 
     import auto_archiver
     assert "tests/data/test_modules/" in auto_archiver.modules.__path__
 
 def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
-    orchestrator.run(test_args + # we still need to load the real path to get the example_module
+    orchestrator.setup_config(test_args + # we still need to load the real path to get the example_module
                      ["--module_paths", "tests/data/invalid_test_modules/"])
 
     assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@@ -97,7 +115,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
     test_args = test_args[:-2]
 
     with pytest.raises(SystemExit) as exit_error:
-        orchestrator.run(test_args)
+        config = orchestrator.setup_config(test_args)
 
     assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
 
@@ -111,24 +129,50 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
     store_yaml(test_yaml, tmp_file)
 
     # run the orchestrator
-    orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
-    assert orchestrator.config is not None
+    config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
+    assert config is not None
 
 def test_load_authentication_string(orchestrator, test_args):
-    orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
-    assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
+    assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
 
 def test_load_authentication_string_concat_site(orchestrator, test_args):
-    orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
-    assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
+    config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
+    assert config['authentication'] == {"x.com": {"api_key": "my_key"},
                                                      "twitter.com": {"api_key": "my_key"}}
 
 def test_load_invalid_authentication_string(orchestrator, test_args):
     with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
+        orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
 
 def test_load_authentication_invalid_dict(orchestrator, test_args):
     with pytest.raises(ArgumentTypeError):
-        orchestrator.run(test_args + ["--authentication", "[true, false]"])
\ No newline at end of file
+        orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
+
+def test_load_modules_from_commandline(orchestrator, test_args):
+    args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
+
+    orchestrator.setup(args)
+
+    assert len(orchestrator.feeders) == 1
+    assert len(orchestrator.extractors) == 1
+    assert len(orchestrator.databases) == 1
+    assert len(orchestrator.enrichers) == 1
+    assert len(orchestrator.formatters) == 1
+
+    assert orchestrator.feeders[0].name == "example_module"
+    assert orchestrator.extractors[0].name == "example_module"
+    assert orchestrator.databases[0].name == "example_module"
+    assert orchestrator.enrichers[0].name == "example_module"
+    assert orchestrator.formatters[0].name == "example_module"
+
+def test_load_settings_for_module_from_commandline(orchestrator, test_args):
+    args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+
+    orchestrator.setup(args)
+
+    assert len(orchestrator.feeders) == 1
+    assert orchestrator.feeders[0].name == "gsheet_feeder"
+    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
\ No newline at end of file
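
For reference, the sketch below shows how the entry points touched by this patch fit together when called from code. It is illustrative only (not part of the diff); "orchestration.yaml" is an assumed example path, and the programmatic path follows the guidance in the new _command_line_run docstring rather than any documented API.

import sys

from auto_archiver.core.orchestrator import ArchivingOrchestrator

# Command-line path (what __main__.py now does): _command_line_run() returns a
# generator of Metadata results, so the caller must drain it.
def run_from_cli() -> None:
    for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]):
        pass

# Programmatic path, recommended for code callers by the new docstring:
# set up the configuration and modules first, then iterate the feed.
def run_from_code() -> None:
    orchestrator = ArchivingOrchestrator()
    orchestrator.setup(["--config", "orchestration.yaml"])  # assumed example config path
    for result in orchestrator.feed():
        print(result.get_url())

# Config-only check, without loading any modules:
def check_config_only() -> dict:
    return ArchivingOrchestrator().setup_config(["--config", "orchestration.yaml"])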