Merge branch 'load_modules' into docs_update

pull/190/head
Patrick Robertson 2025-02-11 13:09:38 +00:00
commit 7d87b858d6
13 changed files with 114 additions and 44 deletions

View file

@ -63,7 +63,7 @@ class BaseModule(ABC):
def setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out contatenated sites
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
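
For context, the loop touched above expands comma-separated site keys in the authentication block. A minimal standalone sketch of the same idea (the helper name is hypothetical, and removing the combined key afterwards is an assumption, since the hunk is cut off):

from copy import copy

def expand_auth_sites(authentication: dict) -> dict:
    # hypothetical standalone version of the loop in BaseModule.setup():
    # a key such as "x.com,twitter.com" is split so each site points at the
    # same credentials dict (dropping the combined key is an assumption here)
    for key, val in copy(authentication).items():
        if "," in key:
            for site in key.split(","):
                authentication[site.strip()] = val
            authentication.pop(key)
    return authentication

# {"x.com,twitter.com": {"api_key": "..."}} -> {"x.com": {...}, "twitter.com": {...}}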

View file

@ -15,15 +15,9 @@ from .module import BaseModule
from typing import Any, List, Type, Tuple
yaml: YAML = YAML()
_yaml: YAML = YAML()
b = yaml.load("""
# This is a comment
site.com,site2.com:
key: value
key2: value2
""")
EMPTY_CONFIG = yaml.load("""
EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
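
EMPTY_CONFIG is loaded through ruamel's round-trip loader, which preserves the explanatory comments when the config is later written back out; a short sketch of that behaviour (illustrative input only):

from ruamel.yaml import YAML

_yaml = YAML()  # ruamel's default 'rt' (round-trip) mode keeps comments
cfg = _yaml.load("# a comment\nsteps:\n  feeders: []\n")
# cfg is a ruamel CommentedMap; dumping it back out reproduces the comment,
# which is what read_yaml()/store_yaml() below rely on for the config file
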
@ -36,6 +30,7 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES
# a dictionary of authentication information that can be used by extractors to log in to websites.
# you can use a comma-separated list for multiple domains on the same line (common use case: x.com,twitter.com)
# Common login 'types' are username/password, cookie, api key/token.
# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
# Some Examples:
# facebook.com:
# username: "my_username"
@ -148,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
config = None
try:
with open(yaml_filename, "r", encoding="utf-8") as inf:
config = yaml.load(inf)
config = _yaml.load(inf)
except FileNotFoundError:
pass
@ -163,6 +158,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
config.pop('urls', None)
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)
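
The one-line fix above matters because the orchestrator keeps using the in-memory config after saving it; a self-contained restatement of the corrected function, with imports added for completeness:

from copy import deepcopy
from ruamel.yaml import YAML

_yaml = YAML()

def store_yaml(config, yaml_filename: str) -> None:
    # mutate only the deep copy: the caller's config keeps its runtime-only keys
    config_to_save = deepcopy(config)
    config_to_save.pop('urls', None)  # 'urls' come from the CLI and are never persisted
    with open(yaml_filename, "w", encoding="utf-8") as outf:
        _yaml.dump(config_to_save, outf)

# after store_yaml(config, "orchestration.yaml") the running orchestrator still
# sees config['urls']; only the file on disk omits them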

View file

@ -13,7 +13,7 @@ import copy
import sys
from importlib.util import find_spec
import os
from os.path import join, dirname
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
@ -64,8 +64,10 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0]
return module
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
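
With the guard above, asking for a module that does not exist surfaces a readable message instead of an uncaught IndexError from indexing an empty list; a hedged usage sketch (the import path is an assumption):

from loguru import logger
from auto_archiver.core.module import get_module_lazy  # import path assumed

try:
    mod = get_module_lazy("does_not_exist")
except IndexError as e:
    # "Module 'does_not_exist' not found. Are you sure it's installed/exists?"
    logger.error(e)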

View file

@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction):
auth_dict = json.load(f)
except json.JSONDecodeError:
# maybe it's yaml, try that
auth_dict = yaml.load(f)
auth_dict = _yaml.load(f)
except:
pass
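
The fallback above lets the authentication input be supplied as JSON or as YAML; a rough standalone sketch of the same pattern (the rewind before re-parsing is an assumption of this sketch, not part of the hunk):

import json
from ruamel.yaml import YAML

_yaml = YAML()

def parse_auth(f) -> dict:
    # try strict JSON first, fall back to YAML for more permissive input
    try:
        return json.load(f)
    except json.JSONDecodeError:
        f.seek(0)  # rewind before re-parsing; an assumption of this sketch
        return _yaml.load(f)
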
@ -111,7 +111,6 @@ class ArchivingOrchestrator:
# if full, we'll load all modules
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
@ -128,6 +127,10 @@ class ArchivingOrchestrator:
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
for module_type in module.type:
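
Taken together, 'simple' mode builds its own steps section: the cli_feeder plus every module that requires no setup. A hedged sketch with the cut-off loop body filled in (import paths and the .name attribute are assumptions):

from auto_archiver.core.base_module import BaseModule      # import paths assumed
from auto_archiver.core.module import available_modules

yaml_config = {'steps': {f"{t}s": [] for t in BaseModule.MODULE_TYPES}}
simple_modules = [m for m in available_modules(with_manifest=True) if not m.requires_setup]

yaml_config['steps']['feeders'] = ['cli_feeder']   # URLs come straight from the CLI
for module in simple_modules:                       # zero-setup modules only
    for module_type in module.type:                 # one module can serve several types
        yaml_config['steps'][f"{module_type}s"].append(module.name)  # .name is assumed
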
@ -237,18 +240,18 @@ class ArchivingOrchestrator:
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self):
def install_modules(self, modules_by_type):
"""
Swaps out the previous 'strings' in the config with the actual modules and loads them
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
are loaded, the program will exit with an error message.
"""
invalid_modules = []
for module_type in BaseModule.MODULE_TYPES:
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
modules_to_load = modules_by_type[f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
def check_steps_ok():
@ -264,9 +267,10 @@ class ArchivingOrchestrator:
for module in modules_to_load:
if module == 'cli_feeder':
# pseudo module, don't load it
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.")
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]:
@ -330,7 +334,7 @@ class ArchivingOrchestrator:
self.setup_complete_parser(basic_config, yaml_config, unused_args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules()
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES:

View file

@ -1,6 +1,6 @@
from __future__ import annotations
from abc import abstractmethod
from typing import IO, Optional
from typing import IO
import os
from loguru import logger

View file

@ -68,7 +68,7 @@ class GsheetsFeeder(Feeder):
folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))
m.set_context('folder', folder)
m.set_context('worksheet', {"row": row, "worksheet": gw})
m.set_context('gsheet', {"row": row, "worksheet": gw})
yield m
logger.success(f'Finished worksheet {wks.title}')

View file

@ -12,6 +12,7 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
from auto_archiver.utils.misc import calculate_file_hash
class HashEnricher(Enricher):
@ -29,15 +30,10 @@ class HashEnricher(Enricher):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename) -> str:
hash = None
hash_algo = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256()
hash_algo = hashlib.sha256
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
hash_algo = hashlib.sha3_512
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
return calculate_file_hash(filename, hash_algo, self.chunksize)

View file

@ -7,12 +7,11 @@ from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage, HashEnricher):
class S3Storage(Storage):
def setup(self, config: dict) -> None:
super().setup(config)
@ -42,14 +41,13 @@ class S3Storage(Storage, HashEnricher):
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
hd = self.calculate_hash(media.filename)
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
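
With the HashEnricher base class removed, the de-duplication check calls the shared helper directly; a rough sketch of how the lookup prefix is built (file_in_folder belongs to the surrounding class and is not reproduced here):

import os
from auto_archiver.utils.misc import calculate_file_hash

NO_DUPLICATES_FOLDER = "no-dups/"

def no_duplicates_prefix(filename: str) -> str:
    # the first 24 hex characters of the (default SHA-256) file hash become the
    # folder that is checked for an already-uploaded copy of the same bytes
    hd = calculate_file_hash(filename)
    return os.path.join(NO_DUPLICATES_FOLDER, hd[:24])

# e.g. "no-dups/" followed by the first 24 hex characters of the digest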

View file

@ -1 +1 @@
from .telethon_extractor import TelethonArchiver
from .telethon_extractor import TelethonExtractor

View file

@ -6,14 +6,14 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
import re, time, json, os
import re, time, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
class TelethonExtractor(Extractor):
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

View file

@ -5,6 +5,7 @@ import json
import uuid
from datetime import datetime
import requests
import hashlib
from loguru import logger
@ -54,9 +55,21 @@ def update_nested_dict(dictionary, update_dict):
else:
dictionary[key] = value
def random_str(length: int = 32) -> str:
assert length <= 32, "length must be less than 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
return json.loads(cli_val)
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
hash = hash_algo()
with open(filename, "rb") as f:
while True:
buf = f.read(chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
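
A short usage sketch of the new helper (the file path is illustrative; the defaults mirror the signature above):

import hashlib
from auto_archiver.utils.misc import calculate_file_hash

sha256_hex = calculate_file_hash("downloads/video.mp4")                    # default: SHA-256, 16 MB chunks
sha3_hex   = calculate_file_hash("downloads/video.mp4", hashlib.sha3_512)  # matches HashEnricher's "SHA3-512"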

View file

@ -0,0 +1,62 @@
import sys
import pytest
from auto_archiver.__main__ import main
@pytest.fixture
def orchestration_file_path(tmp_path):
return (tmp_path / "example_orch.yaml").as_posix()
@pytest.fixture
def orchestration_file(orchestration_file_path):
def _orchestration_file(content=''):
with open(orchestration_file_path, "w") as f:
f.write(content)
return orchestration_file_path
return _orchestration_file
@pytest.fixture
def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]):
def cleanup():
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
request.addfinalizer(cleanup)
# change dir to tmp_path
monkeypatch.chdir(tmp_path)
with monkeypatch.context() as m:
m.setattr(sys, "argv", ["auto-archiver"] + args)
return main()
return _autoarchiver
def test_run_auto_archiver_no_args(caplog, autoarchiver):
with pytest.raises(SystemExit):
autoarchiver()
assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text
def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
# exec 'auto-archiver' on the command line
with pytest.raises(SystemExit):
autoarchiver(["--config", "nonexistent_file.yaml"])
assert "Make sure the file exists and try again, or run without th" in caplog.text
def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# create a valid (empty) orchestration file
path = orchestration_file(content="")
# exec 'auto-archiver' on the command line
with pytest.raises(SystemExit):
autoarchiver(["--config", path])
# should treat an empty file as if there is no file at all
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text