Various fixes for issues with new architecture (#208)

* Add formatters to the TOC - fixes #204

* Add 'steps' settings to the example YAML in the docs. Fixes #206

* Improved docs on authentication architecture

* Fix setting modules on the command line - they now override any module settings in the orchestration as opposed to appending

* Fix tests for gsheet-feeder: add a test service_account.json (note: not real keys in there)

* Rename the command line entrypoint to _command_line_run

Also: make it clear that code implementation should not call this
Make sure the command line entry returns (we don't want a generator)

* Fix unit tests to use new code entry points

* Version bump

* Move iterating of generator up to __main__

* Breakpoint

* two minor fixes

* Fix unit tests + add new '__main__' entry point implementation test

* Skip youtube tests if running on CI. Should still run them locally

* Fix full implementation run on GH actions

* Fix skipif test for GH Actions CI

* Add skipifs for truth - it blocks GH:

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
pull/210/head v0.13.3
Patrick Robertson 2025-02-18 19:10:09 +00:00 zatwierdzone przez GitHub
rodzic 6d43bc7d4d
commit 3c543a3a6a
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
18 zmienionych plików z 314 dodań i 84 usunięć

Wyświetl plik

@ -19,6 +19,19 @@ type_color = {
TABLE_HEADER = ("Option", "Description", "Default", "Type")
EXAMPLE_YAML = """
# steps configuration
steps:
...
{steps_str}
...
# module configuration
...
{config_string}
"""
def generate_module_docs():
yaml = YAML()
SAVE_FOLDER.mkdir(exist_ok=True)
@ -45,11 +58,14 @@ def generate_module_docs():
```
{description}
"""
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
if not manifest['configs']:
readme_str += "\n*This module has no configuration options.*\n"
config_string = f"# No configuration options for {module.name}.*\n"
else:
config_yaml = {}
config_table = header_row
config_yaml = {}
for key, value in manifest['configs'].items():
type = value.get('type', 'string')
if type == 'auto_archiver.utils.json_loader':
@ -65,11 +81,14 @@ def generate_module_docs():
configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
readme_str += "\n## Configuration Options\n"
readme_str += "\n### YAML\n"
yaml_string = io.BytesIO()
yaml.dump({module.name: config_yaml}, yaml_string)
readme_str += f"```{{code}} yaml\n{yaml_string.getvalue().decode('utf-8')}\n```\n"
config_string = io.BytesIO()
yaml.dump({module.name: config_yaml}, config_string)
config_string = config_string.getvalue().decode('utf-8')
yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
if manifest['configs']:
readme_str += "\n### Command Line:\n"
readme_str += config_table
@ -103,3 +122,7 @@ def generate_index(modules_by_type):
with open(SAVE_FOLDER / "module_list.md", "w") as f:
print("writing", SAVE_FOLDER / "module_list.md")
f.write(readme_str)
if __name__ == "__main__":
generate_module_docs()

Wyświetl plik

@ -77,3 +77,6 @@ html_theme = 'sphinx_book_theme'
html_static_path = ["../_static"]
html_css_files = ["custom.css"]
copybutton_prompt_text = r">>> |\.\.\."
copybutton_prompt_is_regexp = True
copybutton_only_copy_prompt_lines = False

Wyświetl plik

@ -24,4 +24,5 @@ modules/extractor
modules/enricher
modules/storage
modules/database
modules/formatter
```

Wyświetl plik

@ -45,3 +45,10 @@ The "archive location" link contains the path of the archived file, in local sto
![The archive result for a link in the demo sheet.](../demo-archive.png)
---
```{toctree}
:maxdepth: 1
:glob:
how_to/*
```

Wyświetl plik

@ -0,0 +1,57 @@
# Authentication
The Authentication framework for auto-archiver allows you to add login details for various websites in a flexible way, directly from the configuration file.
There are two main use cases for authentication:
* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
## The Authentication Config
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
```{code} yaml
authentication:
# optional file to load authentication information from, for security or multi-system deploy purposes
load_from_file: path/to/authentication/file.txt
# optional setting to load cookies from the named browser on the system.
cookies_from_browser: firefox
# optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
cookies_file: path/to/cookies.jar
twitter.com,x.com:
username: myusername
password: 123
facebook.com:
cookie: single_cookie
othersite.com:
api_key: 123
api_secret: 1234
# All available options:
# - username: str - the username to use for login
# - password: str - the password to use for login
# - api_key: str - the API key to use for login
# - api_secret: str - the API secret to use for login
# - cookie: str - a cookie string to use for login (specific to this site)
```
### Recommendations for authentication
1. **Store authentication information separately:**
The authentication part of your configuration contains sensitive information. You should make efforts not to share this with others. For extra security, use the `load_from_file` option to keep your authentication settings out of your configuration file, ideally in a different folder.
2. **Don't use your own personal credentials**
Depending on the website you are extracting information from, there may be rules (Terms of Service) that prohibit you from scraping or extracting information using a bot. If you use your own personal account, there's a possibility it might get blocked/disabled. It's recommended to set up a separate, 'throwaway' account. In that way, if it gets blocked you can easily create another one to continue your archiving.
### How to create a cookies.jar or pass cookies directly to auto-archiver
auto-archiver uses yt-dlp's powerful cookies features under the hood. For instructions on how to extract a cookies.jar (or cookies.txt) file directly from your browser, see the FAQ in the [yt-dlp documentation](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp)
```{note} For developers:
For information on how to access and use authentication settings from within your module, see the `{generic_extractor}` for an example, or view the [`auth_for_site()` function in BaseModule](../autoapi/core/base_module/index.rst)
```

Wyświetl plik

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "0.13.2"
version = "0.13.3"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"

Wyświetl plik

@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run(sys.argv[1:])
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
if __name__ == "__main__":
main()

Wyświetl plik

@ -63,12 +63,6 @@ class BaseModule(ABC):
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
@ -89,16 +83,21 @@ class BaseModule(ABC):
Returns the authentication information for a given site. This is used to authenticate
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
:param site: the domain of the site to get authentication information for
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
Currently, the dict can have keys of the following types:
- username: str - the username to use for login
- password: str - the password to use for login
- api_key: str - the API key to use for login
- api_secret: str - the API secret to use for login
- cookie: str - a cookie string to use for login (specific to this site)
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
:returns: authdict dict of login information for the given site
**Global options:**\n
* cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
* cookies_file: str - the path to a cookies file to use for login\n
**Currently, the sites dict can have keys of the following types:**\n
* username: str - the username to use for login\n
* password: str - the password to use for login\n
* api_key: str - the API key to use for login\n
* api_secret: str - the API secret to use for login\n
* cookie: str - a cookie string to use for login (specific to this site)\n
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

Wyświetl plik

@ -129,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
continue
if key == 'steps':
for module_type, modules in value.items():
# overwrite the 'steps' from the config file with the ones from the CLI
yaml_subdict[key][module_type] = modules
if is_dict_type(value):
update_dict(value, yaml_subdict[key])
elif is_list_type(value):
@ -137,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
update_dict(from_dot_notation(dotdict), yaml_dict)
return yaml_dict
def read_yaml(yaml_filename: str) -> CommentedMap:
@ -159,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
auth_dict = config_to_save.get("authentication", {})
if auth_dict and auth_dict.get('load_from_file'):
# remove all other values from the config, don't want to store it in the config file
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)

Wyświetl plik

@ -8,6 +8,7 @@ from __future__ import annotations
from typing import Generator, Union, List, Type
from urllib.parse import urlparse
from ipaddress import ip_address
from copy import copy
import argparse
import os
import sys
@ -43,30 +44,50 @@ class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string
def load_from_file(path):
try:
with open(auth_dict, 'r') as f:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
pass
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
# extract out concatenated sites
for key, val in copy(auth_dict).items():
if "," in key:
for site in key.split(","):
auth_dict[site] = val
del auth_dict[key]
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
setattr(namespace, self.dest, [])
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
@ -104,36 +125,50 @@ class ArchivingOrchestrator:
return parser
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
# modules parser to get the overridden 'steps' values
modules_parser = argparse.ArgumentParser(
add_help=False,
)
self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in BaseModule.MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser(
add_help=False,
)
self.add_additional_args(parser)
# merge command line module args (--feeders, --enrichers etc.) and add them to the config
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that has requires_setup = False
# if full, we'll load all modules
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(config.get(f"{module_type}s", []))
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_module_args(avail_modules, parser)
self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
if not yaml_config['steps']['feeders']:
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
@ -141,30 +176,38 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
self.add_individual_module_args(available_modules(with_manifest=True), parser)
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
parsed, unknown = parser.parse_known_args(unused_args)
# merge the new config with the old one
self.config = merge_dicts(vars(parsed), yaml_config)
config = merge_dicts(vars(parsed), yaml_config)
# clean out args from the base_parser that we don't want in the config
for key in vars(basic_config):
self.config.pop(key, None)
config.pop(key, None)
# setup the logging
self.setup_logging()
self.setup_logging(config)
if unknown:
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file)
store_yaml(config, basic_config.config_file)
return self.config
return config
def add_modules_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
# Module loading from the command line
for module_type in BaseModule.MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None):
if not parser:
@ -173,30 +216,24 @@ class ArchivingOrchestrator:
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',
default={},
nargs="?",
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
module: LazyBaseModule
for module in modules:
if not module.configs:
@ -226,18 +263,19 @@ class ArchivingOrchestrator:
arg.should_store = should_store
def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help
# for the help message, we want to load manifests from *all* possible modules and show their help/settings
# add configs as arg parser arguments
self.add_modules_args(self.basic_parser)
self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser)
self.add_individual_module_args(parser=self.basic_parser)
self.basic_parser.print_help()
self.basic_parser.exit()
def setup_logging(self):
def setup_logging(self, config):
# setup loguru logging
logger.remove(0) # remove the default logger
logging_config = self.config['logging']
logging_config = config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
@ -318,9 +356,9 @@ class ArchivingOrchestrator:
return read_yaml(config_file)
def setup(self, args: list):
def setup_config(self, args: list) -> dict:
"""
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
Sets up the configuration file, merging the default config with the user's config
"""
self.setup_basic_parser()
@ -333,9 +371,16 @@ class ArchivingOrchestrator:
# if help flag was called, then show the help
if basic_config.help:
self.show_help(basic_config)
# merge command line --feeder etc. args with what's in the yaml config
yaml_config = self.load_config(basic_config.config_file)
self.setup_complete_parser(basic_config, yaml_config, unused_args)
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
def setup(self, args: list):
"""
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
"""
self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps'])
@ -344,8 +389,18 @@ class ArchivingOrchestrator:
for module_type in BaseModule.MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
def run(self, args: list) -> Generator[Metadata]:
def _command_line_run(self, args: list) -> Generator[Metadata]:
"""
This is the main entry point for the orchestrator, when run from the command line.
:param args: list of arguments to pass to the orchestrator - these are the command line args
You should not call this method from code implementations.
This method sets up the configuration, loads the modules, and runs the feed.
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
"""
self.setup(args)
return self.feed()

Wyświetl plik

@ -10,7 +10,7 @@
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {
"default": None,
"help": "(alternative to sheet name) the id of the sheet to archive",
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {

Wyświetl plik

@ -200,7 +200,7 @@
el.innerHTML = decodeCertificate(certificate);
let cyberChefUrl =
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
// create a new anchor with this url and append after the code
let a = document.createElement("a");
a.href = cyberChefUrl;

Wyświetl plik

@ -4,7 +4,6 @@
"requires_setup": True,
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["geckodriver"]
},
"configs": {
"width": {"default": 1280, "help": "width of the screenshots"},

Wyświetl plik

@ -0,0 +1,14 @@
{
"type": "service_account",
"project_id": "some-project-id",
"private_key_id": "some-private-key-id",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDPlcaFJgt7HzoC\n4z0b18PzI2R5c892mLnNwRO8DOKid5INt6z5RAWKDPdnIyHjRBx74qNZl6768pia\nztQNgnud7mKcmvOvGrpUbFx2BdAw8xTyAlRVMalOBhUS9RKvjP5WgSwR5EKwfvzy\nrGioC6ml/segz5EchSaIzgASwB17ir0w6IrymBxUeNelfzCGJpCRhqG5nG+eEjct\nUYU0QIyihRD1Lq0f3Z3D0xfTLLZ630iFBj/Wr0BCJHkl6hdVuGhnyn4S98sMX1Bd\ntaJF/lWi4jdt7SoXD3+FWv66kHPpFfINMpReuB9u0ogfYkORgiRBOMhYBkGGQjUG\nOnBTxEc3AgMBAAECgf9bKiK8DdSz0ALzQbRLhgj2B9485jHI49wjgINOyceZ23uS\nQYXaO+DFLcgLqBkVSGanuHMpU0+qCpeM0v9yXSTIW8RguWMnFd8ID/yLRktxfQa1\n1FAQh+NlF4/gnuUoM8N/FYSy6R5grfaxwU8Qfg66IQXUB52OezSVu5lxNO4G5Rwv\nJ2e/+XYBUv/H26BnQSmjFCzbJkdbtrOeThpaLwLexKcollvoHKGyus0jpWg4C9Ez\n9EJaE+on4nd+cM1Vd+dWaHXoZ9Db9IvxPBqFJE8fynap7RDBeZK678OuCvQntrp4\nrTsE9hW8073Jhl/LbhfbDC0lhFR0JUHygVGE01ECgYEA+g+ddpGGY90yhhM76bTr\nkU6WwislMmfS0WDdLPemNgzLwCtkC2vsQgzg/egxqkVF5dJ9upiFhVgpYxY7ap9U\nSGFemb6T1ASl/1yeNhd0yc4PZFsJ29k+kNgSIlJYm9KDCIMqS1wPoXvFQhbMitOf\n/gLCPugxl67c+qg6nfuODTkCgYEA1IPngESOJnV8oa2WReWrO6+u6xb/OhqdmBzI\n5yq1z3f5gb98XESZR/rCH2vAOmHIJPn3XdZHsznOuxhZwGr1oztiRIurLmBlxQoL\n7tq0jDOUVSD2yeyQwKt5LaBH94P598FiauGxXM4raREWKtcNBGoOX1u1+kEBsoL4\ntf10Z+8CgYEA3QFkB+ECR8y91KW3NAzEjj5JG/8J9wyv1IGpuQ5/hhG1Gni/CSEv\nRAkh6QaIrpZe+ooYuQwIJhwPKBYEGW4MDZSRCYzYFnCtTY5L/j6o55sJG4cipX3R\nwC5XiKIC0mUxjhpvDP+miPBdHNYNnT0AkH1btEF/YzIW+Coq9GnZ2HECgYAOOpax\ne+WYpZ0mphy9qVcBtA2eJ/gGx+ltWeAJuk5aCcpm6Y9GDkHFFAETYX+JaSqhbysk\n2UgLs/8nf8XioEa6GyvFMyTPAh1OSBHseDBGgt2XpZFgi7pVbCW87FJlPCzsbcJN\nLbdWY2d8rWwyihuRBBjaQaW5j8ixTxuf88xreQKBgQCST4Fr8C5CkpakTA+KOost\nLOlziUBm0534mTg7dTcOE1H1+gxtqpXlXcJylpGz1lUXRlHCIutN5iPJcN5cxFES\nsP7wBd7BhficsMKDiWPm9XbP2zXVZu0ldUxA1mONMsS1P4p7i3Dh4uzrRDmSkTUL\njUpppYDumg3oM7wSJ6sTQA==\n-----END PRIVATE KEY-----",
"client_email": "some-email",
"client_id": "some-client-email",
"auth_uri": "https://example.com/o/oauth2/auth",
"token_uri": "https://oauth2.example.com/token",
"auth_provider_x509_cert_url": "https://www.example.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.example.com/robot/v1/metadata/x509/some-email",
"universe_domain": "example.com"
}

Wyświetl plik

@ -9,6 +9,7 @@ import pytest
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
class TestGenericExtractor(TestExtractorBase):
"""Tests Generic Extractor
"""
@ -77,10 +78,11 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert not result
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
@pytest.mark.download
def test_youtube_download(self, make_item):
# url https://www.youtube.com/watch?v=5qap5aO4i9A
item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
result = self.extractor.download(item)
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
@ -114,6 +116,7 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_video(self, make_item):
item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
@ -121,18 +124,21 @@ class TestGenericExtractor(TestExtractorBase):
assert len(result.media) == 1
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_no_media(self, make_item):
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_poll(self, make_item):
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_single_image(self, make_item):
item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
@ -140,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
assert len(result.media) == 1
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_multiple_images(self, make_item):
item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")

Wyświetl plik

@ -34,7 +34,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
@pytest.mark.download
def test_sanitize_url_download(self):
assert "https://t.co/yl3oOJatFp" == self.extractor.sanitize_url("https://www.bellingcat.com/category/resources/")
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),

Wyświetl plik

@ -60,3 +60,15 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# should treat an empty file as if there is no file at all
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
from auto_archiver.__main__ import main
# monkey patch to change the current working directory, so that we don't use the user's real config file
monkeypatch.chdir(tmp_path)
with monkeypatch.context() as m:
m.setattr(sys, "argv", ["auto-archiver"])
with pytest.raises(SystemExit):
main()
assert "No URLs provided. Please provide at least one" in caplog.text

Wyświetl plik

@ -75,18 +75,36 @@ def test_help(orchestrator, basic_parser, capsys):
orchestrator.show_help(args)
assert exit_error.value.code == 0
assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out
logs = capsys.readouterr().out
assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in logs
# basic config options
assert "--version" in logs
# setting modules options
assert "--feeders" in logs
assert "--extractors" in logs
# authentication options
assert "--authentication" in logs
# logging options
assert "--logging.level" in logs
# individual module configs
assert "--gsheet_feeder.sheet_id" in logs
def test_add_custom_modules_path(orchestrator, test_args):
orchestrator.run(test_args)
orchestrator.setup_config(test_args)
import auto_archiver
assert "tests/data/test_modules/" in auto_archiver.modules.__path__
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
orchestrator.run(test_args + # we still need to load the real path to get the example_module
orchestrator.setup_config(test_args + # we still need to load the real path to get the example_module
["--module_paths", "tests/data/invalid_test_modules/"])
assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@ -97,7 +115,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
test_args = test_args[:-2]
with pytest.raises(SystemExit) as exit_error:
orchestrator.run(test_args)
config = orchestrator.setup_config(test_args)
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
@ -111,24 +129,50 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
store_yaml(test_yaml, tmp_file)
# run the orchestrator
orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert orchestrator.config is not None
config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert config is not None
def test_load_authentication_string(orchestrator, test_args):
orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
def test_load_authentication_string_concat_site(orchestrator, test_args):
orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
assert config['authentication'] == {"x.com": {"api_key": "my_key"},
"twitter.com": {"api_key": "my_key"}}
def test_load_invalid_authentication_string(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
def test_load_authentication_invalid_dict(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "[true, false]"])
orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
def test_load_modules_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
orchestrator.setup(args)
assert len(orchestrator.feeders) == 1
assert len(orchestrator.extractors) == 1
assert len(orchestrator.databases) == 1
assert len(orchestrator.enrichers) == 1
assert len(orchestrator.formatters) == 1
assert orchestrator.feeders[0].name == "example_module"
assert orchestrator.extractors[0].name == "example_module"
assert orchestrator.databases[0].name == "example_module"
assert orchestrator.enrichers[0].name == "example_module"
assert orchestrator.formatters[0].name == "example_module"
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
orchestrator.setup(args)
assert len(orchestrator.feeders) == 1
assert orchestrator.feeders[0].name == "gsheet_feeder"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"