Mirror of https://github.com/bellingcat/auto-archiver

commit 441f341139 (pull/244/head)

Merge branch 'main' into linting_etc

# Conflicts:
#   src/auto_archiver/core/consts.py
#   src/auto_archiver/core/orchestrator.py
#   src/auto_archiver/core/storage.py
#   src/auto_archiver/modules/local_storage/local_storage.py
#   src/auto_archiver/modules/s3_storage/s3_storage.py
#   tests/storages/test_S3_storage.py
#   tests/storages/test_local_storage.py
#   tests/storages/test_storage_base.py

@@ -10,7 +10,7 @@
Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.

Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can be enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.

<div class="hidden_rtd">
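For illustration of that pipeline: the ArchivingOrchestrator changed later in this diff exposes setup() and feed(), and its docstrings note that setup should run only once per instance while feed() yields one result per URL. A hypothetical programmatic invocation (the config filename is invented; the --config flag and the setup()/feed() calls appear further down in this diff) might look like:

    from auto_archiver.core.orchestrator import ArchivingOrchestrator

    orchestrator = ArchivingOrchestrator()
    # Parse CLI-style args, load the YAML config and install the configured modules;
    # the orchestrator warns if this is called more than once on the same instance.
    orchestrator.setup(["--config", "orchestration.yaml"])

    # feed() yields a Metadata object per URL produced by the configured feeders.
    for result in orchestrator.feed():
        print(result.get_url(), result.is_success())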
@@ -1,15 +1,25 @@

MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
class SetupError(ValueError):
pass

MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]

MANIFEST_FILE = "__manifest__.py"

DEFAULT_MANIFEST = {
"name": "", # the display name of the module
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
"type": [], # the type of the module, can be one or more of MODULE_TYPES
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
"description": "", # a description of the module
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
"version": "1.0", # the version of the module
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
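The DEFAULT_MANIFEST keys above are what a module declares in its __manifest__.py (see MANIFEST_FILE). A purely illustrative, hypothetical manifest for an invented enricher, following the same bare-dict format the tiktok_tikwm_extractor manifest uses later in this diff, could look like:

    # __manifest__.py of a hypothetical "example_enricher" module (not a real module)
    {
        "name": "Example Enricher",
        "type": ["enricher"],
        "requires_setup": False,
        "description": "Adds a fixed note to every archived item (illustration only).",
        "dependencies": {"python": ["loguru"]},
        "configs": {
            "note": {
                "default": "archived by example",
                "type": "str",
                "help": "text attached to each item",
            },
        },
    }

Keys omitted here (author, version, entry_point) fall back to the defaults in DEFAULT_MANIFEST.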
@@ -6,7 +6,7 @@ nested media retrieval, and type validation.

from __future__ import annotations
import os
import traceback
from typing import Any, List
from typing import Any, List, Iterator
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes

@@ -21,15 +21,14 @@ class Media:

Represents a media file with associated properties and storage details.

Attributes:
- filename: The file path of the media.
- key: An optional identifier for the media.
- filename: The file path of the media as saved locally (temporarily, before uploading to the storage).
- urls: A list of URLs where the media is stored or accessible.
- properties: Additional metadata or transformations for the media.
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
"""

filename: str
key: str = None
_key: str = None
urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg

@@ -48,7 +47,7 @@ class Media:

for any_media in self.all_inner_media(include_self=True):
s.store(any_media, url, metadata=metadata)

def all_inner_media(self, include_self=False):
def all_inner_media(self, include_self=False) -> Iterator[Media]:
"""Retrieves all media, including nested media within properties or transformations on original media.
This function returns a generator for all the inner media.

@@ -69,6 +68,10 @@ class Media:

# checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])

@property
def key(self) -> str:
return self._key

def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
return self
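The change above turns Media.key from a plain dataclass field into a read-only property backed by _key, so storages now assign media._key internally while callers keep reading media.key. A small hypothetical illustration (filename and key invented):

    from auto_archiver.core import Media

    m = Media(filename="/tmp/clip.mp4")
    m._key = "some-folder/clip.mp4"          # what set_key()/storages now assign
    assert m.key == "some-folder/clip.mp4"   # public, read-only view
    m.set("duration", 59)                    # arbitrary properties still go through set()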
@@ -1,6 +1,6 @@

"""Orchestrates all archiving steps, including feeding items,
archiving them with specific archivers, enrichment, storage,
formatting, database operations and clean up.
""" Orchestrates all archiving steps, including feeding items,
archiving them with specific archivers, enrichment, storage,
formatting, database operations and clean up.

"""
|
||||
|
||||
|
@ -19,32 +19,19 @@ import requests
|
|||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import (
|
||||
read_yaml,
|
||||
store_yaml,
|
||||
to_dot_notation,
|
||||
merge_dicts,
|
||||
is_valid_config,
|
||||
DefaultValidatingParser,
|
||||
UniqueAppendAction,
|
||||
AuthenticationJsonParseAction,
|
||||
DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES
|
||||
from .consts import MODULE_TYPES, SetupError
|
||||
from auto_archiver.utils.url import check_url_or_raise
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
|
@ -74,63 +61,30 @@ class ArchivingOrchestrator:
|
|||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
action="store",
|
||||
dest="config_file",
|
||||
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||
default=DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
action="store",
|
||||
dest="mode",
|
||||
type=str,
|
||||
choices=["simple", "full"],
|
||||
help="the mode to run the archiver in",
|
||||
default="simple",
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--store",
|
||||
dest="store",
|
||||
default=False,
|
||||
help="Store the created config in the config file",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--module_paths",
|
||||
dest="module_paths",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="additional paths to search for modules",
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config["steps"].get(f"{module_type}s", []):
|
||||
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||
raise SetupError(
|
||||
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||
)
|
||||
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||
raise SetupError(
|
||||
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||
)
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
|
@ -138,9 +92,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||
"steps"
|
||||
].get(f"{module_type}s", [])
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
|
@ -163,32 +115,30 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = self.module_factory.available_modules(
|
||||
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||
)
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == "simple":
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
|
@ -214,76 +164,43 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(
|
||||
f"--{module_type}s",
|
||||
dest=f"{module_type}s",
|
||||
nargs="+",
|
||||
help=f"the {module_type}s to use",
|
||||
default=[],
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument(
|
||||
"--authentication",
|
||||
dest="authentication",
|
||||
help="A dictionary of sites and their authentication methods \
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction,
|
||||
)
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument(
|
||||
"--logging.level",
|
||||
action="store",
|
||||
dest="logging.level",
|
||||
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||
help="the logging level to use",
|
||||
default="INFO",
|
||||
type=str.upper,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.rotation",
|
||||
action="store",
|
||||
dest="logging.rotation",
|
||||
help="the logging rotation to use",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
def add_individual_module_args(
|
||||
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||
) -> None:
|
||||
if not modules:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
|
||||
for module in modules:
|
||||
if module.name == "cli_feeder":
|
||||
if module.name == 'cli_feeder':
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument(
|
||||
"urls",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
)
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
continue
|
||||
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
|
@ -292,21 +209,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
if not kwargs.get("metavar", None):
|
||||
if not kwargs.get('metavar', None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs["metavar"] = name.upper()
|
||||
kwargs['metavar'] = name.upper()
|
||||
|
||||
if kwargs.get("required", False):
|
||||
if kwargs.get('required', False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop("default", None)
|
||||
kwargs.pop('default', None)
|
||||
|
||||
kwargs.pop("cli_set", None)
|
||||
should_store = kwargs.pop("should_store", False)
|
||||
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
except AttributeError:
|
||||
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
|
@ -321,11 +238,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
logging_config = config["logging"]
|
||||
|
||||
if logging_config.get("enabled", True) is False:
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable("auto_archiver")
|
||||
logger.disable('auto_archiver')
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
|
@ -335,45 +253,38 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
pass
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||
if log_file := logging_config["file"]:
|
||||
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||
log_file, rotation=logging_config["rotation"]
|
||||
)
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
if not modules_to_load:
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
if len(modules_to_load):
|
||||
logger.error(
|
||||
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||
)
|
||||
raise SetupError(
|
||||
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||
)
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
|
||||
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||
raise SetupError(
|
||||
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||
)
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
|
||||
for module in modules_to_load:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
|
@ -382,7 +293,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == "extractor":
|
||||
if loaded_module and module_type == 'extractor':
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
|
@ -397,13 +308,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(
|
||||
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||
)
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
|
@ -426,13 +335,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response["info"]["version"]
|
||||
latest_version = response['info']['version']
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
|
@ -442,36 +351,33 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning(
|
||||
"The `setup_config()` function should only ever be run once. \
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs."
|
||||
)
|
||||
multiple times to archive multiple URLs.")
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config["steps"])
|
||||
self.install_modules(self.config['steps'])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(
|
||||
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||
)
|
||||
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
|
@ -479,9 +385,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
|
@ -499,6 +405,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
|
@ -529,9 +436,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
for d in self.databases:
|
||||
if isinstance(e, AssertionError):
|
||||
if type(e) == AssertionError:
|
||||
d.failed(item, str(e))
|
||||
else:
|
||||
d.failed(item, reason="unexpected error")
|
||||
|
@ -544,13 +451,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
|
@ -566,8 +473,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
url = a.sanitize_url(url)
|
||||
|
||||
result.set_url(url)
|
||||
if original_url != url:
|
||||
result.set("original_url", original_url)
|
||||
if original_url != url: result.set("original_url", original_url)
|
||||
|
||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||
cached_result = None
|
||||
|
@ -578,8 +484,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
if cached_result:
|
||||
logger.debug("Found previously archived entry")
|
||||
for d in self.databases:
|
||||
try:
|
||||
d.done(cached_result, cached=True)
|
||||
try: d.done(cached_result, cached=True)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
return cached_result
|
||||
|
@ -589,15 +494,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
logger.info(f"Trying extractor {a.name} for {url}")
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
if result.is_success():
|
||||
break
|
||||
if result.is_success(): break
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.enrichers:
|
||||
try:
|
||||
e.enrich(result)
|
||||
try: e.enrich(result)
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
|
@ -615,12 +518,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
|
||||
# signal completion to databases and archivers
|
||||
for d in self.databases:
|
||||
try:
|
||||
d.done(result)
|
||||
try: d.done(result)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
|
@ -629,7 +532,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
|
||||
authentication = config.get("authentication", {})
|
||||
authentication = config.get('authentication', {})
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
|
@ -638,8 +541,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config["authentication"] = authentication
|
||||
|
||||
config['authentication'] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
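To make the behaviour of setup_authentication() above concrete: comma-separated site keys are split into one entry per site, for example (values invented):

    config = {"authentication": {"example.com,media.example.com": {"api_token": "XYZ"}}}
    # after setup_authentication(config) the concatenated key is split per site:
    # {"authentication": {"example.com": {"api_token": "XYZ"},
    #                     "media.example.com": {"api_token": "XYZ"}}}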
|
||||
|
|
|
@@ -1,11 +1,29 @@

"""
Base module for Storage modules – modular components that store media objects in various locations.

If you are looking to implement a new storage module, you should subclass the `Storage` class and
implement the `get_cdn_url` and `uploadf` methods.

Your module **must** also have two config variables 'path_generator' and 'filename_generator' which
determine how the key is generated for the media object. The 'path_generator' variable can be set
to one of the following values:
- 'flat': A flat structure with no subfolders
- 'url': A structure based on the URL of the media object
- 'random': A random structure

The 'filename_generator' variable can be set to one of the following values:
- 'random': A random string
- 'static': A replicable strategy such as a hash

If you don't want to use this naming convention, you can override the `set_key` method in your subclass.

"""

from __future__ import annotations
from abc import abstractmethod
from typing import IO
import os
import platform

from loguru import logger
from slugify import slugify
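The module docstring above describes the subclassing contract; a minimal, hypothetical storage module (invented name and destination, and a real module would also ship a __manifest__.py declaring the path_generator/filename_generator configs) could look like:

    import os
    from typing import IO

    from auto_archiver.core import Media, Storage


    class FolderDumpStorage(Storage):
        """Hypothetical example; not a module shipped with Auto Archiver."""

        def get_cdn_url(self, media: Media) -> str:
            # Return wherever the stored copy will be reachable later.
            return f"file:///tmp/archive/{media.key}"

        def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> bool:
            # Called via store() -> upload() once set_key() has filled media.key
            # according to the path_generator/filename_generator settings.
            dest = os.path.join("/tmp/archive", media.key)
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            with open(dest, "wb") as out:
                out.write(file.read())
            return True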
@ -15,18 +33,19 @@ from auto_archiver.utils.misc import random_str
|
|||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
|
||||
self.set_key(media, url, metadata)
|
||||
self.upload(media, metadata=metadata)
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
@ -42,43 +61,56 @@ class Storage(BaseModule):
|
|||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""
|
||||
Uploads (or saves) a file to the storage service/location.
|
||||
|
||||
This method should not be called directly, but instead through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
with open(media.filename, "rb") as f:
|
||||
"""
|
||||
Uploads a media object to the storage service.
|
||||
|
||||
This method should not be called directly, but instead be called through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url, metadata: Metadata) -> None:
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
# media key is already set
|
||||
return
|
||||
folder = metadata.get_context("folder", "")
|
||||
|
||||
folder = metadata.get_context('folder', '')
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
path_generator = self.config.get("path_generator", "url")
|
||||
path_generator = self.path_generator
|
||||
if path_generator == "flat":
|
||||
path = ""
|
||||
filename = slugify(filename) # Ensure filename is slugified
|
||||
elif path_generator == "url":
|
||||
path = slugify(url)
|
||||
path = slugify(url)[:70]
|
||||
elif path_generator == "random":
|
||||
path = self.config.get("random_path", random_str(24), True)
|
||||
path = random_str(24)
|
||||
else:
|
||||
raise ValueError(f"Invalid path_generator: {path_generator}")
|
||||
|
||||
# Handle filename_generator logic
|
||||
filename_generator = self.config.get("filename_generator", "random")
|
||||
filename_generator = self.filename_generator
|
||||
if filename_generator == "random":
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = self.module_factory.get_module(HashEnricher, self.config)
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media._key = key
|
|
@@ -23,9 +23,9 @@

"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"default": False,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,

@@ -17,7 +17,6 @@ class CLIFeeder(Feeder):

for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m

logger.success(f"Processed {len(urls)} URL(s)")
@ -1,3 +1,4 @@
|
|||
|
||||
import shutil
|
||||
from typing import IO
|
||||
import os
|
||||
|
@ -5,25 +6,42 @@ from loguru import logger
|
|||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
class LocalStorage(Storage):
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
if len(self.save_to) > 200:
|
||||
raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
# TODO: is this viable with Storage.configs on path/filename?
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
dest = media.key
|
||||
|
||||
if self.save_absolute:
|
||||
dest = os.path.abspath(dest)
|
||||
return dest
|
||||
|
||||
def set_key(self, media, url, metadata):
|
||||
# clarify we want to save the file to the save_to folder
|
||||
|
||||
old_folder = metadata.get('folder', '')
|
||||
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
|
||||
super().set_key(media, url, metadata)
|
||||
# don't impact other storages that might want a different 'folder' set
|
||||
metadata.set_context('folder', old_folder)
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
# override parent so that we can use shutil.copy2 and keep metadata
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
dest = media.key
|
||||
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
||||
|
||||
res = shutil.copy2(media.filename, dest)
|
||||
logger.info(res)
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
pass
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
from typing import IO
|
||||
|
||||
import boto3
|
||||
|
@ -10,36 +11,33 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str
|
|||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
|
||||
class S3Storage(Storage):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.s3 = boto3.client(
|
||||
"s3",
|
||||
's3',
|
||||
region_name=self.region,
|
||||
endpoint_url=self.endpoint_url.format(region=self.region),
|
||||
aws_access_key_id=self.key,
|
||||
aws_secret_access_key=self.secret,
|
||||
aws_secret_access_key=self.secret
|
||||
)
|
||||
if self.random_no_duplicate:
|
||||
logger.warning(
|
||||
"random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
|
||||
)
|
||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||
|
||||
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
||||
if not self.is_upload_needed(media):
|
||||
return True
|
||||
if not self.is_upload_needed(media): return True
|
||||
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
if not self.private and "ACL" not in extra_args:
|
||||
extra_args["ACL"] = "public-read"
|
||||
if not self.private and 'ACL' not in extra_args:
|
||||
extra_args['ACL'] = 'public-read'
|
||||
|
||||
if "ContentType" not in extra_args:
|
||||
if 'ContentType' not in extra_args:
|
||||
try:
|
||||
if media.mimetype:
|
||||
extra_args["ContentType"] = media.mimetype
|
||||
extra_args['ContentType'] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
|
@ -51,21 +49,21 @@ class S3Storage(Storage):
|
|||
hd = calculate_file_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key := self.file_in_folder(path):
|
||||
media.key = existing_key
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
media._key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
def file_in_folder(self, path: str) -> str:
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
# checks if path exists and is not an empty folder
|
||||
if not path.endswith("/"):
|
||||
path = path + "/"
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
|
||||
if "Contents" in resp:
|
||||
return resp["Contents"][0]["Key"]
|
||||
return False
|
||||
if not path.endswith('/'):
|
||||
path = path + '/'
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
return False
|
|
@@ -0,0 +1 @@

from .tiktok_tikwm_extractor import TiktokTikwmExtractor

@@ -0,0 +1,23 @@

{
"name": "Tiktok Tikwm Extractor",
"type": ["extractor"],
"requires_setup": False,
"dependencies": {
"python": ["loguru", "requests"],
"bin": []
},
"description": """
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/

This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.

### Features
- Downloads the video and, if possible, also the video cover.
- Stores extra metadata about the post like author information, and more as returned by tikwm.com.

### Notes
- If tikwm.com is down, this extractor will not work.
- If tikwm.com changes their API, this extractor may break.
- If no video is found, this extractor will consider the extraction failed.
"""
}
|
@ -0,0 +1,75 @@
|
|||
import re
|
||||
import requests
|
||||
from loguru import logger
|
||||
from datetime import datetime, timezone
|
||||
from yt_dlp.extractor.tiktok import TikTokIE
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class TiktokTikwmExtractor(Extractor):
|
||||
"""
|
||||
Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
|
||||
"""
|
||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
if not re.match(TikTokIE._VALID_URL, url):
|
||||
return False
|
||||
|
||||
endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=url)
|
||||
|
||||
r = requests.get(endpoint)
|
||||
if r.status_code != 200:
|
||||
logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
|
||||
return False
|
||||
|
||||
try:
|
||||
json_response = r.json()
|
||||
except ValueError:
|
||||
logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
|
||||
return False
|
||||
|
||||
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
|
||||
logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
|
||||
return False
|
||||
|
||||
# tries to get the non-watermarked version first
|
||||
video_url = api_data.pop("play", api_data.pop("wmplay", None))
|
||||
if not video_url:
|
||||
logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
|
||||
return False
|
||||
|
||||
# prepare result, start by downloading video
|
||||
result = Metadata()
|
||||
|
||||
# get the cover if possible
|
||||
cover_url = api_data.pop("origin_cover", api_data.pop("cover", api_data.pop("ai_dynamic_cover", None)))
|
||||
if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
|
||||
result.add_media(Media(cover_downloaded))
|
||||
|
||||
# get the video or fail
|
||||
video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
|
||||
if not video_downloaded:
|
||||
logger.error(f"failed to download video from {video_url}")
|
||||
return False
|
||||
video_media = Media(video_downloaded)
|
||||
if duration := api_data.pop("duration", None):
|
||||
video_media.set("duration", duration)
|
||||
result.add_media(video_media)
|
||||
|
||||
# add remaining metadata
|
||||
result.set_title(api_data.pop("title", ""))
|
||||
|
||||
if created_at := api_data.pop("create_time", None):
|
||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||
|
||||
if (author := api_data.pop("author", None)):
|
||||
result.set("author", author)
|
||||
|
||||
result.set("api_data", api_data)
|
||||
|
||||
return result.success("tikwm")
|
|
@ -14,8 +14,8 @@ def enricher(setup_module):
|
|||
def metadata_with_images():
|
||||
m = Metadata()
|
||||
m.set_url("https://example.com")
|
||||
m.add_media(Media(filename="image1.jpg", key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", key="image2"))
|
||||
m.add_media(Media(filename="image1.jpg", _key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", _key="image2"))
|
||||
return m
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
from datetime import datetime, timezone
|
||||
import time
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor import TiktokTikwmExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
"""
|
||||
Test suite for TestTiktokTikwmExtractor.
|
||||
"""
|
||||
|
||||
extractor_module = "tiktok_tikwm_extractor"
|
||||
extractor: TiktokTikwmExtractor
|
||||
|
||||
config = {}
|
||||
|
||||
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
|
||||
|
||||
@staticmethod
|
||||
def get_mockers(mocker):
|
||||
mock_get = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.requests.get")
|
||||
mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
|
||||
return mock_get, mock_logger
|
||||
|
||||
@pytest.mark.parametrize("url,valid_url", [
|
||||
("https://bellingcat.com", False),
|
||||
("https://youtube.com", False),
|
||||
("https://tiktok.co/", False),
|
||||
("https://tiktok.com/", False),
|
||||
("https://www.tiktok.com/", False),
|
||||
("https://api.cool.tiktok.com/", False),
|
||||
(VALID_EXAMPLE_URL, True),
|
||||
("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
|
||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||
])
|
||||
def test_valid_urls(self, mocker, make_item, url, valid_url):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
if valid_url:
|
||||
mock_get.return_value.status_code = 404
|
||||
assert self.extractor.download(make_item(url)) == False
|
||||
assert mock_get.call_count == int(valid_url)
|
||||
assert mock_logger.error.call_count == int(valid_url)
|
||||
|
||||
def test_invalid_json_responses(self, mocker, make_item):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.side_effect = ValueError
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
mock_logger.error.assert_called_once()
|
||||
assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")
|
||||
|
||||
mock_get.return_value.json.side_effect = Exception
|
||||
with pytest.raises(Exception):
|
||||
self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 2
|
||||
assert mock_get.return_value.json.call_count == 2
|
||||
|
||||
@pytest.mark.parametrize("response", [
|
||||
({"msg": "failure"}),
|
||||
({"msg": "success"}),
|
||||
])
|
||||
def test_unsuccessful_responses(self, mocker, make_item, response):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = response
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
mock_logger.error.assert_called_once()
|
||||
assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")
|
||||
|
||||
@pytest.mark.parametrize("response,has_vid", [
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
])
|
||||
def test_correct_extraction(self, mocker, make_item, response, has_vid):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
||||
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
if not has_vid:
|
||||
assert result == False
|
||||
else:
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 1
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 1 + int(has_vid)
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
if not has_vid:
|
||||
mock_logger.error.assert_called_once()
|
||||
assert mock_logger.error.call_args[0][0].startswith("no valid video URL found")
|
||||
else:
|
||||
mock_logger.error.assert_not_called()
|
||||
|
||||
def test_correct_extraction(self, mocker, make_item):
|
||||
mock_get, _ = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", "data": {
|
||||
"wmplay": "url",
|
||||
"origin_cover": "cover.jpg",
|
||||
"title": "Title",
|
||||
"id": 123,
|
||||
"duration": 60,
|
||||
"create_time": 1736301699,
|
||||
"author": "Author",
|
||||
"other": "data"
|
||||
}}
|
||||
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Title"
|
||||
assert result.get("author") == "Author"
|
||||
assert result.get("api_data") == {"other": "data", "id": 123}
|
||||
assert result.media[1].get("duration") == 60
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_video(self, make_item):
|
||||
url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
|
||||
|
||||
result = self.extractor.download(make_item(url))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
|
||||
assert result.get("author").get("unique_id") == "bbcnews"
|
||||
assert result.get("api_data").get("id") == '7478038212070411542'
|
||||
assert result.media[1].get("duration") == 59
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_video(self, make_item, mock_sleep):
|
||||
# sleep is needed because of the rate limit
|
||||
mock_sleep.stop()
|
||||
time.sleep(1.1)
|
||||
mock_sleep.start()
|
||||
|
||||
url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
|
||||
|
||||
result = self.extractor.download(make_item(url))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
|
||||
assert result.get("author").get("id") == "7197400619475649562"
|
||||
assert result.get("api_data").get("id") == '7441821351142362375'
|
||||
assert result.media[1].get("duration") == 34
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
|
|
@ -8,7 +8,6 @@ class TestS3Storage:
|
|||
"""
|
||||
Test suite for S3Storage.
|
||||
"""
|
||||
|
||||
module_name: str = "s3_storage"
|
||||
storage: Type[S3Storage]
|
||||
config: dict = {
|
||||
|
@ -33,28 +32,28 @@
        """Test that S3 client is initialized with correct parameters"""

        assert self.storage.s3 is not None
        assert self.storage.s3.meta.region_name == "test-region"
        assert self.storage.s3.meta.region_name == 'test-region'

    def test_get_cdn_url_generation(self):
        """Test CDN URL formatting"""
        """Test CDN URL formatting """
        media = Media("test.txt")
        media.key = "path/to/file.txt"
        media._key = "path/to/file.txt"
        url = self.storage.get_cdn_url(media)
        assert url == "https://cdn.example.com/path/to/file.txt"
        media.key = "another/path.jpg"
        media._key = "another/path.jpg"
        assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"

    def test_uploadf_sets_acl_public(self, mocker):
        media = Media("test.txt")
        mock_file = mocker.MagicMock()
        mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
        mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
        mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
        self.storage.uploadf(mock_file, media)
        mock_s3_upload.assert_called_once_with(
            mock_file,
            Bucket="test-bucket",
            Bucket='test-bucket',
            Key=media.key,
            ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
            ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
        )

    def test_upload_decision_logic(self, mocker):
@ -62,52 +61,45 @@ class TestS3Storage:
        media = Media("test.txt")
        assert self.storage.is_upload_needed(media) is True
        self.storage.random_no_duplicate = True
        mock_calc_hash = mocker.patch(
            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
            return_value="beepboop123beepboop123beepboop123",
        )
        mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
        mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
        mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
        assert self.storage.is_upload_needed(media) is False
        assert media.key == "existing_key.txt"
        mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
        assert media.key == 'existing_key.txt'
        mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')

    def test_skips_upload_when_duplicate_exists(self, mocker):
        """Test that upload skips when file_in_folder finds existing object"""
        self.storage.random_no_duplicate = True
        mock_file_in_folder = mocker.patch.object(
            S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt"
        )
        mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
        media = Media("test.txt")
        media.key = "original_path.txt"
        mock_calculate_hash = mocker.patch(
            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
            return_value="beepboop123beepboop123beepboop123",
        )
        media._key = "original_path.txt"
        mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
        assert self.storage.is_upload_needed(media) is False
        assert media.key == "existing_folder/existing_file.txt"
        assert media.get("previously archived") is True
        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
        result = self.storage.uploadf(None, media)
        mock_upload.assert_not_called()
        assert result is True

    def test_uploads_with_correct_parameters(self, mocker):
        media = Media("test.txt")
        media.key = "original_key.txt"
        mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
        media.mimetype = "image/png"
        media._key = "original_key.txt"
        mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
        media.mimetype = 'image/png'
        mock_file = mocker.MagicMock()
        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
        self.storage.uploadf(mock_file, media)
        mock_upload.assert_called_once_with(
            mock_file,
            Bucket="test-bucket",
            Key="original_key.txt",
            ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
            Bucket='test-bucket',
            Key='original_key.txt',
            ExtraArgs={
                'ACL': 'public-read',
                'ContentType': 'image/png'
            }
        )

    def test_file_in_folder_exists(self, mocker):
        mock_list_objects = mocker.patch.object(
            self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]}
        )
        assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
        mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
        assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'

@ -44,7 +44,7 @@ def media(tmp_path) -> Media:
    file_path.write_bytes(content)
    media = Media(filename=str(file_path))
    media.properties = {"something": "Title"}
    media.key = "key"
    media._key = "key"
    return media

@ -53,7 +53,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):

def test_path_parts():
    media = Media(filename="test.jpg")
    media.key = "folder1/folder2/test.jpg"
    media._key = "folder1/folder2/test.jpg"


@pytest.mark.skip(reason="Requires real credentials")
@ -1,15 +1,17 @@

import os
from pathlib import Path

import pytest

from auto_archiver.core import Media
from auto_archiver.core import Media, Metadata
from auto_archiver.modules.local_storage import LocalStorage

from auto_archiver.core.consts import SetupError

@pytest.fixture
def local_storage(setup_module, tmp_path) -> LocalStorage:
    save_to = tmp_path / "local_archive"
    save_to.mkdir()
    configs: dict = {
        "path_generator": "flat",
        "filename_generator": "static",
@ -18,35 +20,41 @@ def local_storage(setup_module, tmp_path) -> LocalStorage:
    }
    return setup_module("local_storage", configs)


@pytest.fixture
def sample_media(tmp_path) -> Media:
    """Fixture creating a Media object with temporary source file"""
    src_file = tmp_path / "source.txt"
    src_file.write_text("test content")
    return Media(key="subdir/test.txt", filename=str(src_file))
    return Media(filename=str(src_file))

def test_too_long_save_path(setup_module):
    with pytest.raises(SetupError):
        setup_module("local_storage", {"save_to": "long"*100})

def test_get_cdn_url_relative(local_storage):
    media = Media(key="test.txt", filename="dummy.txt")
    local_storage.filename_generator = "random"
    media = Media(filename="dummy.txt")
    local_storage.set_key(media, "https://example.com", Metadata())
    expected = os.path.join(local_storage.save_to, media.key)
    assert local_storage.get_cdn_url(media) == expected


def test_get_cdn_url_absolute(local_storage):
    media = Media(key="test.txt", filename="dummy.txt")
    local_storage.filename_generator = "random"

    media = Media(filename="dummy.txt")
    local_storage.save_absolute = True
    local_storage.set_key(media, "https://example.com", Metadata())
    expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
    assert local_storage.get_cdn_url(media) == expected


def test_upload_file_contents_and_metadata(local_storage, sample_media):
    local_storage.store(sample_media, "https://example.com", Metadata())
    dest = os.path.join(local_storage.save_to, sample_media.key)
    assert local_storage.upload(sample_media) is True
    assert Path(sample_media.filename).read_text() == Path(dest).read_text()


def test_upload_nonexistent_source(local_storage):
    media = Media(key="missing.txt", filename="nonexistent.txt")
    media = Media(_key="missing.txt", filename="nonexistent.txt")
    with pytest.raises(FileNotFoundError):
        local_storage.upload(media)

@ -2,16 +2,94 @@ from typing import Type

import pytest

from auto_archiver.core.metadata import Metadata
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.storage import Storage

from auto_archiver.core.module import ModuleFactory

class TestStorageBase(object):

    module_name: str = None
    config: dict = None

    @pytest.fixture(autouse=True)
    def setup_storage(self, setup_module):
        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert (
            self.module_name is not None
        ), "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
        self.storage: Type[Storage] = setup_module(self.module_name, self.config)
        self.storage: Type[Storage] = setup_module(
            self.module_name, self.config
        )


class TestBaseStorage(Storage):

    name = "test_storage"

    def get_cdn_url(self, media):
        return "cdn_url"

    def uploadf(self, file, key, **kwargs):
        return True

@pytest.fixture
def dummy_file(tmp_path):
    # create dummy.txt file
    dummy_file = tmp_path / "dummy.txt"
    dummy_file.write_text("test content")
    return str(dummy_file)

@pytest.fixture
def storage_base():
    def _storage_base(config):
        storage_base = TestBaseStorage()
        storage_base.config_setup({TestBaseStorage.name : config})
        storage_base.module_factory = ModuleFactory()
        return storage_base

    return _storage_base

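# note: "pretend-random" comes from the patched random_str below; the fixed
# 6ae8a75555209fd6c44157c0.txt filename is whatever the "static" generator
# derives from the dummy file's contents, so it is identical across all URLs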
@pytest.mark.parametrize(
    "path_generator, filename_generator, url, expected_key",
    [
        ("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"),
        ("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"),
        ("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"),
        ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
        ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
        ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
    ],
)
def test_storage_name_generation(storage_base, path_generator, filename_generator, url,
                                 expected_key, mocker, tmp_path, dummy_file):
    mock_random = mocker.patch("auto_archiver.core.storage.random_str")
    mock_random.return_value = "pretend-random"

    config: dict = {
        "path_generator": path_generator,
        "filename_generator": filename_generator,
    }
    storage: Storage = storage_base(config)
    assert storage.path_generator == path_generator
    assert storage.filename_generator == filename_generator

    metadata = Metadata()
    metadata.set_context("folder", "folder")
    media = Media(filename=dummy_file)
    storage.set_key(media, url, metadata)
    print(media.key)
    assert media.key == expected_key


def test_really_long_name(storage_base, dummy_file):
    config: dict = {
        "path_generator": "url",
        "filename_generator": "static",
    }
    storage: Storage = storage_base(config)

    url = f"https://example.com/{'file'*100}"
    media = Media(filename=dummy_file)
    storage.set_key(media, url, Metadata())
    assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"