Mirror of https://github.com/bellingcat/auto-archiver
Merge branch 'load_modules' into docs_update

commit 7d87b858d6
@@ -63,7 +63,7 @@ class BaseModule(ABC):
     def setup(self, config: dict):

         authentication = config.get('authentication', {})

-        # extract out contatenated sites
+        # extract out concatenated sites
         for key, val in copy(authentication).items():
             if "," in key:
                 for site in key.split(","):

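The hunk ends before the inner loop body; presumably it fans the shared credentials out to each individual site key. A standalone sketch of that assumed behavior (the auth dict and loop body here are hypothetical):

from copy import copy

authentication = {"x.com,twitter.com": {"api_key": "abc"}}
for key, val in copy(authentication).items():
    if "," in key:
        for site in key.split(","):
            authentication[site] = val   # assumed: copy the settings to each site
        del authentication[key]          # assumed: drop the combined key
# authentication now has separate 'x.com' and 'twitter.com' entries
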
@@ -15,15 +15,9 @@ from .module import BaseModule
 from typing import Any, List, Type, Tuple

-yaml: YAML = YAML()
+_yaml: YAML = YAML()

-b = yaml.load("""
-# This is a comment
-site.com,site2.com:
-  key: value
-  key2: value2
-""")
-EMPTY_CONFIG = yaml.load("""
+EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined

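The removed b = yaml.load(...) block was leftover debug code, and the rename to _yaml marks the ruamel instance as module-private. For reference, a minimal sketch of why ruamel's YAML() (round-trip mode by default) is used here, independent of this module:

import sys
from ruamel.yaml import YAML

_yaml = YAML()  # 'rt' mode: preserves comments and key order
data = _yaml.load("# a comment\nkey: value\n")
data["key"] = "new"
_yaml.dump(data, sys.stdout)  # the comment survives the round trip
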
@@ -36,6 +30,7 @@ steps:""" + "".join([f"\n  {module}s: []" for module in BaseModule.MODULE_TYPES
 # a dictionary of authentication information that can be used by extractors to login to website.
 # you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com)
 # Common login 'types' are username/password, cookie, api key/token.
+# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
 # Some Examples:
 # facebook.com:
 #    username: "my_username"

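Pulling the comment block together, an illustrative authentication section as it might be loaded (values are placeholders; the password field is an extrapolation from the username/password 'type' named above):

auth_config = _yaml.load("""
authentication:
  x.com,twitter.com:
    cookies_from_browser: firefox
  facebook.com:
    username: "my_username"
    password: "my_password"
""")
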
@@ -148,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
     config = None
     try:
         with open(yaml_filename, "r", encoding="utf-8") as inf:
-            config = yaml.load(inf)
+            config = _yaml.load(inf)
     except FileNotFoundError:
         pass

@@ -163,6 +158,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
 def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save = deepcopy(config)

-    config.pop('urls', None)
+    config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
-        yaml.dump(config_to_save, outf)
+        _yaml.dump(config_to_save, outf)

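The old line popped 'urls' from the live config rather than from the deep copy, so saving the config also mutated the running session. A minimal illustration of the fixed behavior:

from copy import deepcopy

config = {"urls": ["https://example.com"], "steps": {}}
config_to_save = deepcopy(config)
config_to_save.pop('urls', None)  # the fix: drop runtime-only keys from the copy only
assert config["urls"]             # the in-memory config keeps its URLs
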
@@ -13,7 +13,7 @@ import copy
 import sys
 from importlib.util import find_spec
 import os
-from os.path import join, dirname
+from os.path import join
 from loguru import logger
 import auto_archiver
 from .base_module import BaseModule

@@ -64,8 +64,10 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
     if module_name in _LAZY_LOADED_MODULES:
         return _LAZY_LOADED_MODULES[module_name]

-    module = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)[0]
-    return module
+    available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
+    if not available:
+        raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
+    return available[0]

 def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

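Previously a missing module surfaced as a bare IndexError from the [0] index; the guard now raises one with an actionable message. Usage sketch:

try:
    module = get_module_lazy("nonexistent_module")
except IndexError as e:
    print(e)  # Module 'nonexistent_module' not found. Are you sure it's installed/exists?
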
@@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter

 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule

@@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction):
             auth_dict = json.load(f)
         except json.JSONDecodeError:
             # maybe it's yaml, try that
-            auth_dict = yaml.load(f)
+            auth_dict = _yaml.load(f)
         except:
             pass

@@ -111,7 +111,6 @@ class ArchivingOrchestrator:
         # if full, we'll load all modules
         # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
         # but should we add them? Or should we just add them to the 'complete' parser?
-
         if yaml_config != EMPTY_CONFIG:
             # only load the modules enabled in config
             # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?

@@ -128,6 +127,10 @@ class ArchivingOrchestrator:
         elif basic_config.mode == 'simple':
             simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
             self.add_module_args(simple_modules, parser)

+            # for simple mode, we use the cli_feeder and any modules that don't require setup
+            yaml_config['steps']['feeders'] = ['cli_feeder']
+
+            # add them to the config
             for module in simple_modules:
                 for module_type in module.type:

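The loop body is cut off by the hunk, but per the comments the net effect in simple mode is a steps mapping seeded with the CLI pseudo-feeder plus every module that needs no setup; roughly (module names below are hypothetical):

yaml_config['steps']['feeders'] = ['cli_feeder']
# after the loop, e.g.:
#   feeders:    [cli_feeder]
#   extractors: [generic_extractor]
#   enrichers:  [hash_enricher]
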
@@ -237,18 +240,18 @@ class ArchivingOrchestrator:
         if log_file := logging_config['file']:
             logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])

-    def install_modules(self):
+    def install_modules(self, modules_by_type):
         """
-        Swaps out the previous 'strings' in the config with the actual modules and loads them
+        Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
+        orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
+        are loaded, the program will exit with an error message.
         """

         invalid_modules = []
         for module_type in BaseModule.MODULE_TYPES:
             step_items = []
-            modules_to_load = self.config['steps'][f"{module_type}s"]
+            modules_to_load = modules_by_type[f"{module_type}s"]
             assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"

             def check_steps_ok():

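Taking the steps mapping as a parameter decouples install_modules from self.config and makes it testable with a plain dict; the run() hunk further down passes self.config['steps']. A hypothetical direct call:

# hypothetical: every module type needs at least one entry, or the assert above aborts
steps = {f"{module_type}s": ["some_module"] for module_type in BaseModule.MODULE_TYPES}
orchestrator.install_modules(steps)
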
@@ -264,9 +267,10 @@ class ArchivingOrchestrator:
             for module in modules_to_load:
                 if module == 'cli_feeder':
                     # pseudo module, don't load it
                     urls = self.config['urls']
                     if not urls:
-                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.")
+                        logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
                         exit()

+    # cli_feeder is a pseudo module, it just takes the command line args
     def feed(self) -> Generator[Metadata]:

@@ -330,7 +334,7 @@ class ArchivingOrchestrator:
         self.setup_complete_parser(basic_config, yaml_config, unused_args)

         logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
-        self.install_modules()
+        self.install_modules(self.config['steps'])

         # log out the modules that were loaded
         for module_type in BaseModule.MODULE_TYPES:

@@ -1,6 +1,6 @@
 from __future__ import annotations
 from abc import abstractmethod
-from typing import IO, Optional
+from typing import IO
 import os

 from loguru import logger

@@ -68,7 +68,7 @@ class GsheetsFeeder(Feeder):
             folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))

             m.set_context('folder', folder)
-            m.set_context('worksheet', {"row": row, "worksheet": gw})
+            m.set_context('gsheet', {"row": row, "worksheet": gw})
             yield m

         logger.success(f'Finished worksheet {wks.title}')

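Downstream consumers must follow the rename and read the 'gsheet' context key; a hypothetical read, assuming Metadata exposes a get_context counterpart to set_context:

gsheet = m.get_context('gsheet')      # hypothetical accessor
row, wks = gsheet["row"], gsheet["worksheet"]
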
@@ -12,6 +12,7 @@ from loguru import logger

 from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata
+from auto_archiver.utils.misc import calculate_file_hash


 class HashEnricher(Enricher):

@@ -29,15 +30,10 @@ class HashEnricher(Enricher):
             to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")

     def calculate_hash(self, filename) -> str:
-        hash = None
+        hash_algo = None
         if self.algorithm == "SHA-256":
-            hash = hashlib.sha256()
+            hash_algo = hashlib.sha256
         elif self.algorithm == "SHA3-512":
-            hash = hashlib.sha3_512()
+            hash_algo = hashlib.sha3_512
         else: return ""
-        with open(filename, "rb") as f:
-            while True:
-                buf = f.read(self.chunksize)
-                if not buf: break
-                hash.update(buf)
-        return hash.hexdigest()
+        return calculate_file_hash(filename, hash_algo, self.chunksize)

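The inline read/hash loop moves into the shared calculate_file_hash helper (defined in the utils hunk below), so the enricher and the storages hash files identically. For example, the SHA3-512 branch now amounts to (filename hypothetical):

import hashlib
from auto_archiver.utils.misc import calculate_file_hash

digest = calculate_file_hash("some_video.mp4", hashlib.sha3_512, chunksize=16000000)
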
@@ -7,12 +7,11 @@ from loguru import logger

 from auto_archiver.core import Media
 from auto_archiver.core import Storage
-from auto_archiver.modules.hash_enricher import HashEnricher
-from auto_archiver.utils.misc import random_str
+from auto_archiver.utils.misc import calculate_file_hash, random_str

 NO_DUPLICATES_FOLDER = "no-dups/"

-class S3Storage(Storage, HashEnricher):
+class S3Storage(Storage):

     def setup(self, config: dict) -> None:
         super().setup(config)

@@ -42,14 +41,13 @@ class S3Storage(Storage, HashEnricher):
                 extra_args['ContentType'] = media.mimetype
         except Exception as e:
             logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
-
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
         return True

     def is_upload_needed(self, media: Media) -> bool:
         if self.random_no_duplicate:
             # checks if a folder with the hash already exists, if so it skips the upload
-            hd = self.calculate_hash(media.filename)
+            hd = calculate_file_hash(media.filename)
             path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])

             if existing_key:=self.file_in_folder(path):

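Together with the import change above, S3Storage no longer inherits HashEnricher just to hash files; it calls the shared helper, which defaults to SHA-256 here. A sketch of the dedup key it builds (filename hypothetical):

import os

hd = calculate_file_hash("downloaded_media.mp4")    # SHA-256 hex digest
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])  # folder keyed on the first 24 hex chars
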
@@ -1 +1 @@
-from .telethon_extractor import TelethonArchiver
+from .telethon_extractor import TelethonExtractor

@@ -6,14 +6,14 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
 from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
 from loguru import logger
 from tqdm import tqdm
-import re, time, json, os
+import re, time, os

 from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


-class TelethonArchiver(Extractor):
+class TelethonExtractor(Extractor):
     valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

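valid_url matches both public and private (/c/) message links; a quick standalone check:

import re

valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
assert valid_url.match("https://t.me/somechannel/1234")   # public channel post
assert valid_url.match("https://t.me/c/1234567/89")       # private channel post
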
@@ -5,6 +5,7 @@ import json
 import uuid
 from datetime import datetime
 import requests
+import hashlib
 from loguru import logger

@@ -54,9 +55,21 @@ def update_nested_dict(dictionary, update_dict):
         else:
             dictionary[key] = value


 def random_str(length: int = 32) -> str:
     assert length <= 32, "length must be less than 32 as UUID4 is used"
     return str(uuid.uuid4()).replace("-", "")[:length]


+def json_loader(cli_val):
+    return json.loads(cli_val)
+
+
+def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
+    hash = hash_algo()
+    with open(filename, "rb") as f:
+        while True:
+            buf = f.read(chunksize)
+            if not buf: break
+            hash.update(buf)
+    return hash.hexdigest()

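A quick sanity check that the chunked helper matches hashlib's one-shot digest (self-contained; the temp file is only for illustration):

import hashlib, os, tempfile

with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"hello world" * 1000)
    name = f.name
with open(name, "rb") as f:
    expected = hashlib.sha256(f.read()).hexdigest()
assert calculate_file_hash(name) == expected  # chunked result == one-shot digest
os.unlink(name)
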
@@ -0,0 +1,62 @@
+import sys
+import pytest
+
+from auto_archiver.__main__ import main
+
+
+@pytest.fixture
+def orchestration_file_path(tmp_path):
+    return (tmp_path / "example_orch.yaml").as_posix()
+
+
+@pytest.fixture
+def orchestration_file(orchestration_file_path):
+    def _orchestration_file(content=''):
+        with open(orchestration_file_path, "w") as f:
+            f.write(content)
+        return orchestration_file_path
+
+    return _orchestration_file
+
+
+@pytest.fixture
+def autoarchiver(tmp_path, monkeypatch, request):
+    def _autoarchiver(args=[]):
+
+        def cleanup():
+            from loguru import logger
+            if not logger._core.handlers.get(0):
+                logger._core.handlers_count = 0
+                logger.add(sys.stderr)
+
+        request.addfinalizer(cleanup)
+
+        # change dir to tmp_path
+        monkeypatch.chdir(tmp_path)
+        with monkeypatch.context() as m:
+            m.setattr(sys, "argv", ["auto-archiver"] + args)
+            return main()
+
+    return _autoarchiver
+
+
+def test_run_auto_archiver_no_args(caplog, autoarchiver):
+    with pytest.raises(SystemExit):
+        autoarchiver()
+
+    assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text
+
+
+def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
+    # exec 'auto-archiver' on the command line
+    with pytest.raises(SystemExit):
+        autoarchiver(["--config", "nonexistent_file.yaml"])
+
+    assert "Make sure the file exists and try again, or run without th" in caplog.text
+
+
+def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
+    # create a valid (empty) orchestration file
+    path = orchestration_file(content="")
+    # exec 'auto-archiver' on the command line
+    with pytest.raises(SystemExit):
+        autoarchiver(["--config", path])
+
+    # should treat an empty file as if there is no file at all
+    assert " No URLs provided. Please provide at least one URL via the com" in caplog.text