Merge branch 'main' into linting_etc

# Conflicts:
#	src/auto_archiver/core/consts.py
#	src/auto_archiver/core/orchestrator.py
#	src/auto_archiver/core/storage.py
#	src/auto_archiver/modules/local_storage/local_storage.py
#	src/auto_archiver/modules/s3_storage/s3_storage.py
#	tests/storages/test_S3_storage.py
#	tests/storages/test_local_storage.py
#	tests/storages/test_storage_base.py
pull/244/head
erinhmclark 2025-03-11 10:39:47 +00:00
commit 441f341139
19 changed files with 614 additions and 320 deletions

View file

@ -10,7 +10,7 @@
Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.
Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can be enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.
<div class="hidden_rtd">

View file

@ -1,15 +1,25 @@
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
class SetupError(ValueError):
pass
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
DEFAULT_MANIFEST = {
"name": "", # the display name of the module
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
"type": [], # the type of the module, can be one or more of MODULE_TYPES
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
"description": "", # a description of the module
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
"version": "1.0", # the version of the module
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
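For orientation, here is a minimal sketch of a hypothetical `__manifest__.py` that fills in the keys documented above. The module name and values are illustrative only; the "default"/"type"/"help" layout of the config option matches the manifest format used elsewhere in this commit.
# Illustrative manifest for a hypothetical "example_enricher" module (not part of this commit).
{
    "name": "Example Enricher",
    "author": "Bellingcat",
    "type": ["enricher"],  # one or more of MODULE_TYPES
    "requires_setup": False,
    "description": "Attaches an example note to each archived item.",
    "dependencies": {"python": ["loguru"]},
    "entry_point": "",  # blank, so the default module_name::ModuleName resolution applies
    "version": "1.0",
    "configs": {
        "note": {"default": "hello", "type": "str", "help": "text to attach to each item"},
    },
}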

View file

@ -6,7 +6,7 @@ nested media retrieval, and type validation.
from __future__ import annotations
import os
import traceback
from typing import Any, List
from typing import Any, List, Iterator
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
@ -21,15 +21,14 @@ class Media:
Represents a media file with associated properties and storage details.
Attributes:
- filename: The file path of the media.
- key: An optional identifier for the media.
- filename: The file path of the media as saved locally (temporarily, before uploading to the storage).
- urls: A list of URLs where the media is stored or accessible.
- properties: Additional metadata or transformations for the media.
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
"""
filename: str
key: str = None
_key: str = None
urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg
@ -48,7 +47,7 @@ class Media:
for any_media in self.all_inner_media(include_self=True):
s.store(any_media, url, metadata=metadata)
def all_inner_media(self, include_self=False):
def all_inner_media(self, include_self=False) -> Iterator[Media]:
"""Retrieves all media, including nested media within properties or transformations on original media.
This function returns a generator for all the inner media.
@ -69,6 +68,10 @@ class Media:
# checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
@property
def key(self) -> str:
return self._key
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
return self
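A short illustration of the refactored interface (the filename and values here are made up; the behaviour follows the dataclass shown above):
# Illustrative only: 'key' is now a read-only property backed by '_key',
# which a Storage assigns via its set_key() method.
m = Media(filename="/tmp/archive/video.mp4")  # hypothetical local file
m.set("duration", 60)                         # stored in m.properties
assert m.key is None                          # no key until a storage sets one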

View file

@ -1,6 +1,6 @@
"""Orchestrates all archiving steps, including feeding items,
archiving them with specific archivers, enrichment, storage,
formatting, database operations and clean up.
""" Orchestrates all archiving steps, including feeding items,
archiving them with specific archivers, enrichment, storage,
formatting, database operations and clean up.
"""
@ -19,32 +19,19 @@ import requests
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import (
read_yaml,
store_yaml,
to_dot_notation,
merge_dicts,
is_valid_config,
DefaultValidatingParser,
UniqueAppendAction,
AuthenticationJsonParseAction,
DEFAULT_CONFIG_FILE,
)
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES
from .consts import MODULE_TYPES, SetupError
from auto_archiver.utils.url import check_url_or_raise
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
class SetupError(ValueError):
pass
class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
@ -74,63 +61,30 @@ class ArchivingOrchestrator:
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
parser.add_argument("--version", action="version", version=__version__)
parser.add_argument(
"--config",
action="store",
dest="config_file",
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
default=DEFAULT_CONFIG_FILE,
)
parser.add_argument(
"--mode",
action="store",
dest="mode",
type=str,
choices=["simple", "full"],
help="the mode to run the archiver in",
default="simple",
)
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
# override the default 'help' so we can inject all the configs and show those
parser.add_argument(
"-s",
"--store",
dest="store",
default=False,
help="Store the created config in the config file",
action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"--module_paths",
dest="module_paths",
nargs="+",
default=[],
help="additional paths to search for modules",
action=UniqueAppendAction,
)
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
self.basic_parser = parser
return parser
def check_steps(self, config):
for module_type in MODULE_TYPES:
if not config["steps"].get(f"{module_type}s", []):
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
raise SetupError(
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
)
if module_type == "extractor" and config["steps"].get("archivers"):
raise SetupError(
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
)
raise SetupError(
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
)
if not config['steps'].get(f"{module_type}s", []):
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
if module_type == 'extractor' and config['steps'].get('archivers'):
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
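For reference, a sketch of the post-0.13.0 layout these messages point to (plural step names, and 'extractors' instead of 'archivers'). The module names below are placeholders or modules that appear elsewhere in this commit, not a recommended configuration:
# Illustrative config.yaml 'steps' section as of version 0.13.0.
steps:
  feeders:
    - cli_feeder
  extractors:
    - tiktok_tikwm_extractor
  enrichers:
    - hash_enricher
  databases:
    - [your_database_name_here]
  storages:
    - local_storage
  formatters:
    - [your_formatter_name_here]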
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
# modules parser to get the overridden 'steps' values
modules_parser = argparse.ArgumentParser(
add_help=False,
@ -138,9 +92,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in MODULE_TYPES:
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
"steps"
].get(f"{module_type}s", [])
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser(
add_help=False,
@ -163,32 +115,30 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
enabled_modules = []
# first loads the modules from the config file, then from the command line
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = self.module_factory.available_modules(
limit_to_modules=enabled_modules, suppress_warnings=True
)
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == "simple":
elif basic_config.mode == 'simple':
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser)
# add them to the config
for module in simple_modules:
for module_type in module.type:
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
all_modules = self.module_factory.available_modules()
# add all the modules to the steps
for module in all_modules:
for module_type in module.type:
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
self.add_individual_module_args(all_modules, parser)
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
@ -214,76 +164,43 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
store_yaml(config, basic_config.config_file)
return config
def add_modules_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
# Module loading from the command line
for module_type in MODULE_TYPES:
parser.add_argument(
f"--{module_type}s",
dest=f"{module_type}s",
nargs="+",
help=f"the {module_type}s to use",
default=[],
action=UniqueAppendAction,
)
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
parser.add_argument(
"--authentication",
dest="authentication",
help="A dictionary of sites and their authentication methods \
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.",
default={},
nargs="?",
action=AuthenticationJsonParseAction,
)
You may also pass a path to a valid JSON/YAML file which will be parsed.',
default={},
nargs="?",
action=AuthenticationJsonParseAction)
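Roughly, on the command line that might look like the following (a sketch assuming the auto-archiver console entry point; the site and field names are illustrative, only 'token' and 'username' are mentioned in the help text above):
# Illustrative only; a path to a JSON/YAML file with the same structure also works.
auto-archiver --authentication '{"example.com": {"username": "my_user", "token": "my_token"}}'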
# logging arguments
parser.add_argument(
"--logging.level",
action="store",
dest="logging.level",
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
help="the logging level to use",
default="INFO",
type=str.upper,
)
parser.add_argument(
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
)
parser.add_argument(
"--logging.rotation",
action="store",
dest="logging.rotation",
help="the logging rotation to use",
default=None,
)
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
def add_individual_module_args(
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
) -> None:
if not modules:
modules = self.module_factory.available_modules()
for module in modules:
if module.name == "cli_feeder":
if module.name == 'cli_feeder':
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
parser.add_argument(
"urls",
nargs="*",
default=[],
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
)
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
continue
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
@ -292,21 +209,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items():
if not kwargs.get("metavar", None):
if not kwargs.get('metavar', None):
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
kwargs["metavar"] = name.upper()
kwargs['metavar'] = name.upper()
if kwargs.get("required", False):
if kwargs.get('required', False):
# required args shouldn't have a 'default' value, remove it
kwargs.pop("default", None)
kwargs.pop('default', None)
kwargs.pop("cli_set", None)
should_store = kwargs.pop("should_store", False)
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
kwargs.pop('cli_set', None)
should_store = kwargs.pop('should_store', False)
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
try:
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
except AttributeError:
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
arg.should_store = should_store
@ -321,11 +238,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.basic_parser.exit()
def setup_logging(self, config):
logging_config = config["logging"]
if logging_config.get("enabled", True) is False:
logging_config = config['logging']
if logging_config.get('enabled', True) is False:
# disabled logging settings, they're set on a higher level
logger.disable("auto_archiver")
logger.disable('auto_archiver')
return
# setup loguru logging
@ -335,45 +253,38 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
pass
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
if log_file := logging_config["file"]:
logger.add(log_file) if not logging_config["rotation"] else logger.add(
log_file, rotation=logging_config["rotation"]
)
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self, modules_by_type):
"""
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
are loaded, the program will exit with an error message.
"""
invalid_modules = []
for module_type in MODULE_TYPES:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
if not modules_to_load:
raise SetupError(
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
)
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
def check_steps_ok():
if not len(step_items):
if len(modules_to_load):
logger.error(
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
)
raise SetupError(
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
)
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
raise SetupError(
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
)
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
for module in modules_to_load:
if module in invalid_modules:
continue
@ -382,7 +293,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if loaded_module and module_type == "extractor":
if loaded_module and module_type == 'extractor':
loaded_module.cleanup()
raise e
@ -397,13 +308,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
)
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
raise FileNotFoundError(f"Configuration file {config_file} not found")
return read_yaml(config_file)
def setup_config(self, args: list) -> dict:
"""
Sets up the configuration file, merging the default config with the user's config
@ -426,13 +335,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
yaml_config = self.load_config(basic_config.config_file)
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
def check_for_updates(self):
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
latest_version = response["info"]["version"]
latest_version = response['info']['version']
# check version compared to current version
if latest_version != __version__:
if os.environ.get("RUNNING_IN_DOCKER"):
if os.environ.get('RUNNING_IN_DOCKER'):
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
else:
update_cmd = "`pip install --upgrade auto-archiver`"
@ -442,36 +351,33 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
logger.warning("")
def setup(self, args: list):
"""
Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
"""
self.check_for_updates()
if self.setup_finished:
logger.warning(
"The `setup_config()` function should only ever be run once. \
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs."
)
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config["steps"])
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in MODULE_TYPES:
logger.info(
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
)
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
def _command_line_run(self, args: list) -> Generator[Metadata]:
@ -479,9 +385,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
This is the main entry point for the orchestrator, when run from the command line.
:param args: list of arguments to pass to the orchestrator - these are the command line args
You should not call this method from code implementations.
This method sets up the configuration, loads the modules, and runs the feed.
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
@ -499,6 +405,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
e.cleanup()
def feed(self) -> Generator[Metadata]:
url_count = 0
for feeder in self.feeders:
for item in feeder:
@ -529,9 +436,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
self.cleanup()
exit()
except Exception as e:
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases:
if isinstance(e, AssertionError):
if type(e) == AssertionError:
d.failed(item, str(e))
else:
d.failed(item, reason="unexpected error")
@ -544,13 +451,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
def archive(self, result: Metadata) -> Union[Metadata, None]:
"""
Runs the archiving process for a single URL
1. Each archiver can sanitize its own URLs
2. Check for cached results in Databases, and signal start to the databases
3. Call Archivers until one succeeds
4. Call Enrichers
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
Runs the archiving process for a single URL
1. Each archiver can sanitize its own URLs
2. Check for cached results in Databases, and signal start to the databases
3. Call Archivers until one succeeds
4. Call Enrichers
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
"""
original_url = result.get_url().strip()
@ -566,8 +473,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
url = a.sanitize_url(url)
result.set_url(url)
if original_url != url:
result.set("original_url", original_url)
if original_url != url: result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
cached_result = None
@ -578,8 +484,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
if cached_result:
logger.debug("Found previously archived entry")
for d in self.databases:
try:
d.done(cached_result, cached=True)
try: d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
@ -589,15 +494,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
logger.info(f"Trying extractor {a.name} for {url}")
try:
result.merge(a.download(result))
if result.is_success():
break
if result.is_success(): break
except Exception as e:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.enrichers:
try:
e.enrich(result)
try: e.enrich(result)
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
@ -615,12 +518,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
# signal completion to databases and archivers
for d in self.databases:
try:
d.done(result)
try: d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return result
def setup_authentication(self, config: dict) -> dict:
"""
@ -629,7 +532,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
Split up strings into multiple sites if they are comma separated
"""
authentication = config.get("authentication", {})
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
@ -638,8 +541,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
site = site.strip()
authentication[site] = val
del authentication[key]
config["authentication"] = authentication
config['authentication'] = authentication
return config
# Helper Properties

View file

@ -1,11 +1,29 @@
"""
Base module for Storage modules modular components that store media objects in various locations.
If you are looking to implement a new storage module, you should subclass the `Storage` class and
implement the `get_cdn_url` and `uploadf` methods.
Your module **must** also have two config variables 'path_generator' and 'filename_generator' which
determine how the key is generated for the media object. The 'path_generator' variable
can be set to one of the following values:
- 'flat': A flat structure with no subfolders
- 'url': A structure based on the URL of the media object
- 'random': A random structure
The 'filename_generator' variable can be set to one of the following values:
- 'random': A random string
- 'static': A replicable strategy such as a hash
If you don't want to use this naming convention, you can override the `set_key` method in your subclass.
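For example, a storage entry in the configuration could look roughly like this (a sketch only; 'local_storage' and 'save_to' appear elsewhere in this commit, the particular combination of values is an assumption):
    local_storage:
      save_to: ./local_archive
      path_generator: url          # one of: flat, url, random
      filename_generator: static   # one of: random, static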
"""
from __future__ import annotations
from abc import abstractmethod
from typing import IO
import os
import platform
from loguru import logger
from slugify import slugify
@ -15,18 +33,19 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
class Storage(BaseModule):
"""
Base class for implementing storage modules in the media archiving framework.
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
"""
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
if media.is_stored(in_storage=self):
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
if media.is_stored(in_storage=self):
logger.debug(f"{media.key} already stored, skipping")
return
self.set_key(media, url, metadata)
self.upload(media, metadata=metadata)
media.add_url(self.get_cdn_url(media))
@ -42,43 +61,56 @@ class Storage(BaseModule):
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
"""
Uploads (or saves) a file to the storage service/location.
This method should not be called directly, but instead through the 'store' method,
which sets up the media for storage.
"""
pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
with open(media.filename, "rb") as f:
"""
Uploads a media object to the storage service.
This method should not be called directly, but instead be called through the 'store' method,
which sets up the media for storage.
"""
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url, metadata: Metadata) -> None:
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0:
# media key is already set
return
folder = metadata.get_context("folder", "")
folder = metadata.get_context('folder', '')
filename, ext = os.path.splitext(media.filename)
# Handle path_generator logic
path_generator = self.config.get("path_generator", "url")
path_generator = self.path_generator
if path_generator == "flat":
path = ""
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url":
path = slugify(url)
path = slugify(url)[:70]
elif path_generator == "random":
path = self.config.get("random_path", random_str(24), True)
path = random_str(24)
else:
raise ValueError(f"Invalid path_generator: {path_generator}")
# Handle filename_generator logic
filename_generator = self.config.get("filename_generator", "random")
filename_generator = self.filename_generator
if filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module
he = self.module_factory.get_module(HashEnricher, self.config)
he = self.module_factory.get_module("hash_enricher", self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:
raise ValueError(f"Invalid filename_generator: {filename_generator}")
key = os.path.join(folder, path, f"{filename}{ext}")
media.key = os.path.join(folder, path, f"{filename}{ext}")
media._key = key

View file

@ -23,9 +23,9 @@
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"default": False,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,

View file

@ -17,7 +17,6 @@ class CLIFeeder(Feeder):
for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m
logger.success(f"Processed {len(urls)} URL(s)")

View file

@ -1,3 +1,4 @@
import shutil
from typing import IO
import os
@ -5,25 +6,42 @@ from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.core.consts import SetupError
class LocalStorage(Storage):
def setup(self) -> None:
if len(self.save_to) > 200:
raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename?
dest = os.path.join(self.save_to, media.key)
dest = media.key
if self.save_absolute:
dest = os.path.abspath(dest)
return dest
def set_key(self, media, url, metadata):
# clarify we want to save the file to the save_to folder
old_folder = metadata.get('folder', '')
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
super().set_key(media, url, metadata)
# don't impact other storages that might want a different 'folder' set
metadata.set_context('folder', old_folder)
def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata
dest = os.path.join(self.save_to, media.key)
dest = media.key
os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
res = shutil.copy2(media.filename, dest)
logger.info(res)
return True
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
pass
pass

View file

@ -1,3 +1,4 @@
from typing import IO
import boto3
@ -10,36 +11,33 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
def setup(self) -> None:
self.s3 = boto3.client(
"s3",
's3',
region_name=self.region,
endpoint_url=self.endpoint_url.format(region=self.region),
aws_access_key_id=self.key,
aws_secret_access_key=self.secret,
aws_secret_access_key=self.secret
)
if self.random_no_duplicate:
logger.warning(
"random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
)
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
if not self.is_upload_needed(media):
return True
if not self.is_upload_needed(media): return True
extra_args = kwargs.get("extra_args", {})
if not self.private and "ACL" not in extra_args:
extra_args["ACL"] = "public-read"
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if "ContentType" not in extra_args:
if 'ContentType' not in extra_args:
try:
if media.mimetype:
extra_args["ContentType"] = media.mimetype
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
@ -51,21 +49,21 @@ class S3Storage(Storage):
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key := self.file_in_folder(path):
media.key = existing_key
if existing_key:=self.file_in_folder(path):
media._key = existing_key
media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
media._key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path: str) -> str:
def file_in_folder(self, path:str) -> str:
# checks if path exists and is not an empty folder
if not path.endswith("/"):
path = path + "/"
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
if "Contents" in resp:
return resp["Contents"][0]["Key"]
return False
if not path.endswith('/'):
path = path + '/'
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
if 'Contents' in resp:
return resp['Contents'][0]['Key']
return False

View file

@ -0,0 +1 @@
from .tiktok_tikwm_extractor import TiktokTikwmExtractor

View file

@ -0,0 +1,23 @@
{
"name": "Tiktok Tikwm Extractor",
"type": ["extractor"],
"requires_setup": False,
"dependencies": {
"python": ["loguru", "requests"],
"bin": []
},
"description": """
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.
### Features
- Downloads the video and, if possible, also the video cover.
- Stores extra metadata about the post like author information, and more as returned by tikwm.com.
### Notes
- If tikwm.com is down, this extractor will not work.
- If tikwm.com changes their API, this extractor may break.
- If no video is found, this extractor will consider the extraction failed.
"""
}

View file

@ -0,0 +1,75 @@
import re
import requests
from loguru import logger
from datetime import datetime, timezone
from yt_dlp.extractor.tiktok import TikTokIE
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class TiktokTikwmExtractor(Extractor):
"""
Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
"""
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if not re.match(TikTokIE._VALID_URL, url):
return False
endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=url)
r = requests.get(endpoint)
if r.status_code != 200:
logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
return False
try:
json_response = r.json()
except ValueError:
logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
return False
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
return False
# tries to get the non-watermarked version first
video_url = api_data.pop("play", api_data.pop("wmplay", None))
if not video_url:
logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
return False
# prepare result, start by downloading video
result = Metadata()
# get the cover if possible
cover_url = api_data.pop("origin_cover", api_data.pop("cover", api_data.pop("ai_dynamic_cover", None)))
if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
result.add_media(Media(cover_downloaded))
# get the video or fail
video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
if not video_downloaded:
logger.error(f"failed to download video from {video_url}")
return False
video_media = Media(video_downloaded)
if duration := api_data.pop("duration", None):
video_media.set("duration", duration)
result.add_media(video_media)
# add remaining metadata
result.set_title(api_data.pop("title", ""))
if created_at := api_data.pop("create_time", None):
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
if (author := api_data.pop("author", None)):
result.set("author", author)
result.set("api_data", api_data)
return result.success("tikwm")
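For orientation, the subset of the tikwm.com JSON payload that this extractor actually reads looks roughly like the fixture used in the tests further down; the field names come from the code above, the values are illustrative:
# Fields read by the extractor (values illustrative, not a full tikwm.com response):
{
    "msg": "success",
    "data": {
        "play": "https://...",          # preferred non-watermarked video URL; "wmplay" is the fallback
        "origin_cover": "https://...",  # cover image; "cover" and "ai_dynamic_cover" are fallbacks
        "id": 1234,
        "title": "...",
        "duration": 60,
        "create_time": 1736301699,      # unix timestamp, becomes the result's timestamp
        "author": {"unique_id": "..."}  # stored verbatim under "author"
    }
}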

View file

@ -14,8 +14,8 @@ def enricher(setup_module):
def metadata_with_images():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="image1.jpg", key="image1"))
m.add_media(Media(filename="image2.jpg", key="image2"))
m.add_media(Media(filename="image1.jpg", _key="image1"))
m.add_media(Media(filename="image2.jpg", _key="image2"))
return m

View file

@ -0,0 +1,154 @@
from datetime import datetime, timezone
import time
import pytest
from auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor import TiktokTikwmExtractor
from .test_extractor_base import TestExtractorBase
class TestTiktokTikwmExtractor(TestExtractorBase):
"""
Test suite for TestTiktokTikwmExtractor.
"""
extractor_module = "tiktok_tikwm_extractor"
extractor: TiktokTikwmExtractor
config = {}
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
@staticmethod
def get_mockers(mocker):
mock_get = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.requests.get")
mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
return mock_get, mock_logger
@pytest.mark.parametrize("url,valid_url", [
("https://bellingcat.com", False),
("https://youtube.com", False),
("https://tiktok.co/", False),
("https://tiktok.com/", False),
("https://www.tiktok.com/", False),
("https://api.cool.tiktok.com/", False),
(VALID_EXAMPLE_URL, True),
("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
])
def test_valid_urls(self, mocker, make_item, url, valid_url):
mock_get, mock_logger = self.get_mockers(mocker)
if valid_url:
mock_get.return_value.status_code = 404
assert self.extractor.download(make_item(url)) == False
assert mock_get.call_count == int(valid_url)
assert mock_logger.error.call_count == int(valid_url)
def test_invalid_json_responses(self, mocker, make_item):
mock_get, mock_logger = self.get_mockers(mocker)
mock_get.return_value.status_code = 200
mock_get.return_value.json.side_effect = ValueError
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False
mock_get.assert_called_once()
mock_get.return_value.json.assert_called_once()
mock_logger.error.assert_called_once()
assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")
mock_get.return_value.json.side_effect = Exception
with pytest.raises(Exception):
self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
mock_get.assert_called()
assert mock_get.call_count == 2
assert mock_get.return_value.json.call_count == 2
@pytest.mark.parametrize("response", [
({"msg": "failure"}),
({"msg": "success"}),
])
def test_unsuccessful_responses(self, mocker, make_item, response):
mock_get, mock_logger = self.get_mockers(mocker)
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = response
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False
mock_get.assert_called_once()
mock_get.return_value.json.assert_called_once()
mock_logger.error.assert_called_once()
assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")
@pytest.mark.parametrize("response,has_vid", [
({"data": {"id": 123}}, False),
({"data": {"wmplay": "url"}}, True),
({"data": {"play": "url"}}, True),
])
def test_correct_extraction(self, mocker, make_item, response, has_vid):
mock_get, mock_logger = self.get_mockers(mocker)
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = {"msg": "success", **response}
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
if not has_vid:
assert result == False
else:
assert result.is_success()
assert len(result.media) == 1
mock_get.assert_called()
assert mock_get.call_count == 1 + int(has_vid)
mock_get.return_value.json.assert_called_once()
if not has_vid:
mock_logger.error.assert_called_once()
assert mock_logger.error.call_args[0][0].startswith("no valid video URL found")
else:
mock_logger.error.assert_not_called()
def test_correct_extraction(self, mocker, make_item):
mock_get, _ = self.get_mockers(mocker)
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = {"msg": "success", "data": {
"wmplay": "url",
"origin_cover": "cover.jpg",
"title": "Title",
"id": 123,
"duration": 60,
"create_time": 1736301699,
"author": "Author",
"other": "data"
}}
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
assert result.is_success()
assert len(result.media) == 2
assert result.get_title() == "Title"
assert result.get("author") == "Author"
assert result.get("api_data") == {"other": "data", "id": 123}
assert result.media[1].get("duration") == 60
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
@pytest.mark.download
def test_download_video(self, make_item):
url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
result = self.extractor.download(make_item(url))
assert result.is_success()
assert len(result.media) == 2
assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
assert result.get("author").get("unique_id") == "bbcnews"
assert result.get("api_data").get("id") == '7478038212070411542'
assert result.media[1].get("duration") == 59
assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
@pytest.mark.download
def test_download_sensitive_video(self, make_item, mock_sleep):
# sleep is needed because of the rate limit
mock_sleep.stop()
time.sleep(1.1)
mock_sleep.start()
url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
result = self.extractor.download(make_item(url))
assert result.is_success()
assert len(result.media) == 2
assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
assert result.get("author").get("id") == "7197400619475649562"
assert result.get("api_data").get("id") == '7441821351142362375'
assert result.media[1].get("duration") == 34
assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)

View file

@ -8,7 +8,6 @@ class TestS3Storage:
"""
Test suite for S3Storage.
"""
module_name: str = "s3_storage"
storage: Type[S3Storage]
config: dict = {
@ -33,28 +32,28 @@ class TestS3Storage:
"""Test that S3 client is initialized with correct parameters"""
assert self.storage.s3 is not None
assert self.storage.s3.meta.region_name == "test-region"
assert self.storage.s3.meta.region_name == 'test-region'
def test_get_cdn_url_generation(self):
"""Test CDN URL formatting"""
"""Test CDN URL formatting """
media = Media("test.txt")
media.key = "path/to/file.txt"
media._key = "path/to/file.txt"
url = self.storage.get_cdn_url(media)
assert url == "https://cdn.example.com/path/to/file.txt"
media.key = "another/path.jpg"
media._key = "another/path.jpg"
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
def test_uploadf_sets_acl_public(self, mocker):
media = Media("test.txt")
mock_file = mocker.MagicMock()
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
self.storage.uploadf(mock_file, media)
mock_s3_upload.assert_called_once_with(
mock_file,
Bucket="test-bucket",
Bucket='test-bucket',
Key=media.key,
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
)
def test_upload_decision_logic(self, mocker):
@ -62,52 +61,45 @@ class TestS3Storage:
media = Media("test.txt")
assert self.storage.is_upload_needed(media) is True
self.storage.random_no_duplicate = True
mock_calc_hash = mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_key.txt"
mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
assert media.key == 'existing_key.txt'
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
def test_skips_upload_when_duplicate_exists(self, mocker):
"""Test that upload skips when file_in_folder finds existing object"""
self.storage.random_no_duplicate = True
mock_file_in_folder = mocker.patch.object(
S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt"
)
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
media = Media("test.txt")
media.key = "original_path.txt"
mock_calculate_hash = mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
media._key = "original_path.txt"
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt"
assert media.get("previously archived") is True
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
result = self.storage.uploadf(None, media)
mock_upload.assert_not_called()
assert result is True
def test_uploads_with_correct_parameters(self, mocker):
media = Media("test.txt")
media.key = "original_key.txt"
mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
media.mimetype = "image/png"
media._key = "original_key.txt"
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
media.mimetype = 'image/png'
mock_file = mocker.MagicMock()
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
self.storage.uploadf(mock_file, media)
mock_upload.assert_called_once_with(
mock_file,
Bucket="test-bucket",
Key="original_key.txt",
ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
Bucket='test-bucket',
Key='original_key.txt',
ExtraArgs={
'ACL': 'public-read',
'ContentType': 'image/png'
}
)
def test_file_in_folder_exists(self, mocker):
mock_list_objects = mocker.patch.object(
self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]}
)
assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'

View file

@ -44,7 +44,7 @@ def media(tmp_path) -> Media:
file_path.write_bytes(content)
media = Media(filename=str(file_path))
media.properties = {"something": "Title"}
media.key = "key"
media._key = "key"
return media

View file

@ -53,7 +53,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
def test_path_parts():
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
media._key = "folder1/folder2/test.jpg"
@pytest.mark.skip(reason="Requires real credentials")

View file

@ -1,15 +1,17 @@
import os
from pathlib import Path
import pytest
from auto_archiver.core import Media
from auto_archiver.core import Media, Metadata
from auto_archiver.modules.local_storage import LocalStorage
from auto_archiver.core.consts import SetupError
@pytest.fixture
def local_storage(setup_module, tmp_path) -> LocalStorage:
save_to = tmp_path / "local_archive"
save_to.mkdir()
configs: dict = {
"path_generator": "flat",
"filename_generator": "static",
@ -18,35 +20,41 @@ def local_storage(setup_module, tmp_path) -> LocalStorage:
}
return setup_module("local_storage", configs)
@pytest.fixture
def sample_media(tmp_path) -> Media:
"""Fixture creating a Media object with temporary source file"""
src_file = tmp_path / "source.txt"
src_file.write_text("test content")
return Media(key="subdir/test.txt", filename=str(src_file))
return Media(filename=str(src_file))
def test_too_long_save_path(setup_module):
with pytest.raises(SetupError):
setup_module("local_storage", {"save_to": "long"*100})
def test_get_cdn_url_relative(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
local_storage.filename_generator = "random"
media = Media(filename="dummy.txt")
local_storage.set_key(media, "https://example.com", Metadata())
expected = os.path.join(local_storage.save_to, media.key)
assert local_storage.get_cdn_url(media) == expected
def test_get_cdn_url_absolute(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
local_storage.filename_generator = "random"
media = Media(filename="dummy.txt")
local_storage.save_absolute = True
local_storage.set_key(media, "https://example.com", Metadata())
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
assert local_storage.get_cdn_url(media) == expected
def test_upload_file_contents_and_metadata(local_storage, sample_media):
local_storage.store(sample_media, "https://example.com", Metadata())
dest = os.path.join(local_storage.save_to, sample_media.key)
assert local_storage.upload(sample_media) is True
assert Path(sample_media.filename).read_text() == Path(dest).read_text()
def test_upload_nonexistent_source(local_storage):
media = Media(key="missing.txt", filename="nonexistent.txt")
media = Media(_key="missing.txt", filename="nonexistent.txt")
with pytest.raises(FileNotFoundError):
local_storage.upload(media)

View file

@ -2,16 +2,94 @@ from typing import Type
import pytest
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.storage import Storage
from auto_archiver.core.module import ModuleFactory
class TestStorageBase(object):
module_name: str = None
config: dict = None
@pytest.fixture(autouse=True)
def setup_storage(self, setup_module):
assert self.module_name is not None, "self.module_name must be set on the subclass"
assert (
self.module_name is not None
), "self.module_name must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.storage: Type[Storage] = setup_module(self.module_name, self.config)
self.storage: Type[Storage] = setup_module(
self.module_name, self.config
)
class TestBaseStorage(Storage):
name = "test_storage"
def get_cdn_url(self, media):
return "cdn_url"
def uploadf(self, file, key, **kwargs):
return True
@pytest.fixture
def dummy_file(tmp_path):
# create dummy.txt file
dummy_file = tmp_path / "dummy.txt"
dummy_file.write_text("test content")
return str(dummy_file)
@pytest.fixture
def storage_base():
def _storage_base(config):
storage_base = TestBaseStorage()
storage_base.config_setup({TestBaseStorage.name : config})
storage_base.module_factory = ModuleFactory()
return storage_base
return _storage_base
@pytest.mark.parametrize(
"path_generator, filename_generator, url, expected_key",
[
("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"),
("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"),
("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"),
("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
],
)
def test_storage_name_generation(storage_base, path_generator, filename_generator, url,
expected_key, mocker, tmp_path, dummy_file):
mock_random = mocker.patch("auto_archiver.core.storage.random_str")
mock_random.return_value = "pretend-random"
config: dict = {
"path_generator": path_generator,
"filename_generator": filename_generator,
}
storage: Storage = storage_base(config)
assert storage.path_generator == path_generator
assert storage.filename_generator == filename_generator
metadata = Metadata()
metadata.set_context("folder", "folder")
media = Media(filename=dummy_file)
storage.set_key(media, url, metadata)
print(media.key)
assert media.key == expected_key
def test_really_long_name(storage_base, dummy_file):
config: dict = {
"path_generator": "url",
"filename_generator": "static",
}
storage: Storage = storage_base(config)
url = f"https://example.com/{'file'*100}"
media = Media(filename=dummy_file)
storage.set_key(media, url, Metadata())
assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"