Mirror of https://github.com/bellingcat/auto-archiver
Further fixes/changes to loading 'types' for config + manifest edits
parent
14e2479599
commit
7fd95866a1
@@ -10,13 +10,6 @@ from auto_archiver.core import Metadata, Step
 class Database(Step, ABC):
 
     name = "database"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Database:
-        # only for typing...
-        return Step.init(name, config, Database)
     def started(self, item: Metadata) -> None:
        """signals the DB that the given item archival has started"""
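
Note on the pattern: the `__init__`/`init` boilerplate removed here (and in the many similar hunks below) existed only to force `Step.__init__` to run and to give type-checkers a typed `init`. A hedged sketch of why it becomes redundant once the base class applies config itself — `Step` here is a stand-in and `PrintDb` is hypothetical, not the repository's implementation:

    from abc import ABC

    class Step(ABC):
        # stand-in: copies this step's config section onto the instance
        name = "step"

        def __init__(self, config: dict) -> None:
            for key, value in config.get(self.name, {}).items():
                setattr(self, key, value)

    class Database(Step, ABC):
        name = "database"

        def started(self, item) -> None:
            """signals the DB that the given item archival has started"""

    class PrintDb(Database):
        # no per-class __init__ needed: Step.__init__ runs by default
        name = "print_db"

        def started(self, item) -> None:
            print(f"STARTED {item}")

    db = PrintDb({"print_db": {"verbose": True}})
    assert db.verbose is True
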
@@ -18,14 +18,5 @@ class Enricher(Step, ABC):
     """Base classes and utilities for enrichers in the Auto-Archiver system."""
     name = "enricher"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-
-    # only for typing...
-    def init(name: str, config: dict) -> Enricher:
-        return Step.init(name, config, Enricher)
-
     @abstractmethod
     def enrich(self, to_enrich: Metadata) -> None: pass
@@ -9,13 +9,5 @@ from auto_archiver.core import Step
 class Feeder(Step):
     name = "feeder"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Feeder:
-        # only for code typing
-        return Step.init(name, config, Feeder)
-
     @abstractmethod
     def __iter__(self) -> Metadata: return None
@@ -6,16 +6,12 @@ flexible setup in various environments.
 """
 
 import argparse
-from ruamel.yaml import YAML, CommentedMap
-from ruamel.yaml.comments import CommentedMap
+from ruamel.yaml import YAML, CommentedMap, add_representer
 
 from dataclasses import dataclass, field
-from collections import OrderedDict
 from collections.abc import Iterable
-from copy import deepcopy
 from .loader import MODULE_TYPES
 
-from typing import Any, List
+from typing import Any, List, Type
 
 # configurable_parents = [
 #     Feeder,
@@ -152,12 +152,12 @@ class ArchivingOrchestrator:
         if not modules:
             modules = available_modules(with_manifest=True)
 
+        module: Module
         for module in modules:
             if not module.configs:
                 # this module has no configs, don't show anything in the help
                 # (TODO: do we want to show something about this module though, like a description?)
                 continue
-
             group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
             for name, kwargs in module.configs.items():
                 # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
@@ -165,8 +165,8 @@ class ArchivingOrchestrator:
                 kwargs.pop('cli_set', None)
                 kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
                 try:
-                    kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str'))
-                except AttributeError:
+                    kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
+                except KeyError:
                     kwargs['type'] = getattr(validators, kwargs['type'])
                 group.add_argument(f"--{module.name}.{name}", **kwargs)
 
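
The hunk above is the heart of the 'types' change: a manifest can declare `'type'` as a string, which is resolved to a callable for argparse — built-in names first, falling back to named validator functions. A hedged sketch of that resolution rule; it uses `import builtins`, which sidesteps the module-vs-dict ambiguity of `__builtins__` that the try/except above works around, and `VALIDATORS`/`positive_number` are stand-ins:

    import builtins

    def positive_number(value):
        # stand-in for the 'positive_number' validator referenced by the
        # hash_enricher manifest further down
        n = float(value)
        if n <= 0:
            raise ValueError(f"{value!r} is not a positive number")
        return n

    VALIDATORS = {"positive_number": positive_number}

    def resolve_type(type_name):
        # built-in names ('str', 'int', 'float', 'bool', ...) resolve directly
        builtin = getattr(builtins, type_name or "str", None)
        if callable(builtin):
            return builtin
        # anything else must be a registered validator
        return VALIDATORS[type_name]

    assert resolve_type("int")("42") == 42
    assert resolve_type("positive_number")("1.6e7") == 16000000.0
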
@@ -207,7 +207,7 @@ class ArchivingOrchestrator:
             exit()
 
         if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
-            logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
+            logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
             exit()
 
         for i, module in enumerate(modules_to_load):
@@ -5,10 +5,12 @@
     "external_dependencies": {
         "python": ["loguru"],
     },
+    'entry_point': 'cli_feeder::CLIFeeder',
     "configs": {
         "urls": {
             "default": None,
             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
+            "nargs": "+",
         },
     },
     "description": """
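
With `"nargs": "+"`, argparse accepts one or more URLs for this option directly on the command line, matching the help text's note that they should not come from config.yaml. A hypothetical invocation (assuming the packaged `auto-archiver` console script):

    auto-archiver --cli_feeder.urls https://example.com/post/1 https://example.com/post/2
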
@@ -7,12 +7,6 @@ from auto_archiver.core import Metadata, ArchivingContext
 class CLIFeeder(Feeder):
     name = "cli_feeder"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        if type(self.urls) != list or len(self.urls) == 0:
-            raise Exception("CLI Feeder did not receive any URL to process")
-
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")
@@ -8,11 +8,6 @@ class ConsoleDb(Database):
     """
     Outputs results to the console
     """
     name = "console_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def started(self, item: Metadata) -> None:
         logger.warning(f"STARTED {item}")
@@ -4,6 +4,7 @@
     "requires_setup": False,
     "external_dependencies": {"python": ["loguru"]
     },
+    'entry_point': 'csv_db::CSVDb',
     "configs": {
         "csv_file": {"default": "db.csv", "help": "CSV file name"}
     },
@@ -11,13 +11,6 @@ class CSVDb(Database):
     """
     Outputs results to a CSV file
     """
     name = "csv_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.assert_valid_string("csv_file")
-
-
     def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
@@ -6,6 +6,8 @@
         "python": ["loguru"],
         "bin": [""]
     },
+    'requires_setup': True,
+    'entry_point': "csv_feeder::CSVFeeder",
     "configs": {
         "files": {
             "default": None,
@@ -2,7 +2,7 @@
     'name': 'Generic Extractor',
     'version': '0.1.0',
     'author': 'Bellingcat',
-    'type': ['extractor', 'feeder', 'enricher'],
+    'type': ['extractor'],
     'requires_setup': False,
     'dependencies': {
         'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
@@ -1,7 +1,7 @@
 {
     "name": "Google Sheets Feeder",
     "type": ["feeder"],
-    "entry_point": "GsheetsFeeder",
+    "entry_point": "gsheet_feeder::GsheetsFeeder",
     "requires_setup": True,
     "external_dependencies": {
         "python": ["loguru", "gspread", "python-slugify"],
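
This and the other manifest edits in this commit converge on a `file::Class` entry-point convention, so one string tells the loader both the module file and the class to instantiate. A hedged sketch of how such a string might be consumed — the loader names here are assumptions, not the repository's code:

    import importlib

    def load_entry_point(package: str, entry_point: str):
        # "gsheet_feeder::GsheetsFeeder" -> module file + class name
        module_name, class_name = entry_point.split("::")
        module = importlib.import_module(f"{package}.{module_name}")
        return getattr(module, class_name)

    # e.g. load_entry_point("auto_archiver.modules.gsheet_feeder",
    #                       "gsheet_feeder::GsheetsFeeder")
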
@@ -8,7 +8,10 @@
     "configs": {
         "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
         # TODO add non-negative requirement to match previous implementation?
-        "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
+        "chunksize": {"default": 1.6e7,
+            "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
+            'type': 'positive_number',
+        },
     },
     "description": """
     Generates cryptographic hashes for media files to ensure data integrity and authenticity.
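
For context on what `chunksize` controls: the enricher hashes each file in fixed-size reads so large files never have to fit in RAM, hence the new `positive_number` constraint. A minimal standalone illustration with hashlib (a sketch, not the enricher's code), mirroring the manifest's algorithm choices and 16MB default:

    import hashlib

    def hash_file(path, algorithm="SHA-256", chunksize=int(1.6e7)):
        # read the file chunksize bytes at a time and feed each chunk
        # to the incremental hasher
        hasher = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}[algorithm]()
        with open(path, "rb") as f:
            while chunk := f.read(chunksize):
                hasher.update(chunk)
        return hasher.hexdigest()
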
@@ -18,36 +18,6 @@ class HashEnricher(Enricher):
     """
     Calculates hashes for Media instances
     """
     name = "hash_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        algos = self.configs()["algorithm"]
-        algo_choices = algos["choices"]
-        if not getattr(self, 'algorithm', None):
-            if not config.get('algorithm'):
-                logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
-                self.algorithm = algos["default"]
-            else:
-                self.algorithm = config["algorithm"]
-
-        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
-
-        if not getattr(self, 'chunksize', None):
-            if config.get('chunksize'):
-                self.chunksize = config["chunksize"]
-            else:
-                self.chunksize = self.configs()["chunksize"]["default"]
-
-        try:
-            self.chunksize = int(self.chunksize)
-        except ValueError:
-            raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
-
-        assert self.chunksize >= -1, "read length must be non-negative or -1"
-
-        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -16,17 +16,17 @@ from auto_archiver.utils.misc import random_str
 
 @dataclass
 class HtmlFormatter(Formatter):
     name = "html_formatter"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
-        # JinjaHelper class static methods are added as filters
-        self.environment.filters.update({
-            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
-        })
-        self.template = self.environment.get_template("html_template.html")
+    # TODO: fix setting up template with new config method
+    # def __init__(self, config: dict) -> None:
+    #     # without this STEP.__init__ is not called
+    #     super().__init__(config)
+    #     self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
+    #     # JinjaHelper class static methods are added as filters
+    #     self.environment.filters.update({
+    #         k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
+    #     })
+    #     self.template = self.environment.get_template("html_template.html")
 
     def format(self, item: Metadata) -> Media:
         url = item.get_url()
@@ -1 +1 @@
-from .local import LocalStorage
+from .local_storage import LocalStorage
@@ -11,9 +11,10 @@ from auto_archiver.base_processors import Storage
 class LocalStorage(Storage):
     name = "local_storage"
 
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        os.makedirs(self.save_to, exist_ok=True)
+    def __init__(self) -> None:
+        super().__init__()
+        # TODO: fix up passing config values to 'steps'
+        # os.makedirs(self.save_to, exist_ok=True)
 
     def get_cdn_url(self, media: Media) -> str:
         # TODO: is this viable with Storage.configs on path/filename?
@@ -10,12 +10,6 @@ class MetaEnricher(Enricher):
     """
     Adds metadata information about the archive operations, to be included at the end of all enrichments
     """
     name = "meta_enricher"
-
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -10,11 +10,6 @@ class MetadataEnricher(Enricher):
     """
     Extracts metadata information from files using exiftool.
     """
     name = "metadata_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
 
@@ -1,7 +1,7 @@
-m = {
+{
     "name": "Mute Formatter",
     "type": ["formatter"],
-    "requires_setup": False,
+    "requires_setup": True,
     "external_dependencies": {
     },
     "description": """ Default formatter.
@@ -1,16 +1,12 @@
 from __future__ import annotations
 from dataclasses import dataclass
 
-from ..core import Metadata, Media
-from . import Formatter
+from auto_archiver.core import Metadata, Media
+from auto_archiver.base_processors import Formatter
 
 
 @dataclass
 class MuteFormatter(Formatter):
     name = "mute_formatter"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def format(self, item: Metadata) -> Media: return None
@@ -25,11 +25,6 @@ class PdqHashEnricher(Enricher):
     Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.
     Ideally this enrichment is orchestrated to run after the thumbnail_enricher.
     """
     name = "pdq_hash_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # Without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -20,7 +20,7 @@
             "region": {"default": None, "help": "S3 region name"},
             "key": {"default": None, "help": "S3 API key"},
             "secret": {"default": None, "help": "S3 API secret"},
-            "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
+            "random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
             "endpoint_url": {
                 "default": 'https://{region}.digitaloceanspaces.com',
                 "help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -5,6 +5,7 @@
     "external_dependencies": {
         "python": ["loguru", "python-slugify"],
     },
+    'entry_point': 'ssl_enricher::SSLEnricher',
    "configs": {
         "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
     },
@@ -11,11 +11,6 @@ class SSLEnricher(Enricher):
     """
     Retrieves SSL certificate information for a domain, as a file
     """
     name = "ssl_enricher"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
-
     def enrich(self, to_enrich: Metadata) -> None:
         if not to_enrich.media and self.skip_when_nothing_archived: return
@@ -11,11 +11,6 @@ class TelegramExtractor(Extractor):
     Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
     will only return if at least one image or one video is found
     """
     name = "telegram_extractor"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
@@ -1,4 +1,3 @@
-import json
 {
     "name": "telethon_extractor",
     "type": ["extractor"],
@@ -42,4 +41,4 @@ To use the `TelethonExtractor`, you must configure the following:
 - **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
 
     """
-    }
+}
@@ -18,13 +18,6 @@ class ThumbnailEnricher(Enricher):
     """
     Generates thumbnails for all the media
     """
     name = "thumbnail_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
-        self.max_thumbnails = int(self.max_thumbnails)
-
     def enrich(self, to_enrich: Metadata) -> None:
         """