kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge branch 'main' into feat/yt-dlp-pots
commit
b5dc1854a2
|
@ -22,7 +22,7 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
@ -33,14 +33,14 @@ jobs:
|
|||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96
|
||||
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
|
||||
with:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
|
|
|
@ -10,9 +10,6 @@ on:
|
|||
workflows: ["Core Tests"]
|
||||
types:
|
||||
- completed
|
||||
branches: [main]
|
||||
paths:
|
||||
- src/**
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
|
@ -30,7 +27,10 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.workflow_run.head_branch || github.ref }}
|
||||
# For PRs, use the head commit SHA from the triggering workflow
|
||||
ref: ${{ github.event.workflow_run.head_sha || github.ref }}
|
||||
# If PR is from a fork, we need fetch-depth: 0
|
||||
fetch-depth: ${{ github.event.workflow_run.head_repository.fork && '0' || '1' }}
|
||||
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
|
|
|
@ -214,7 +214,7 @@ class LazyBaseModule:
|
|||
|
||||
# check external dependencies are installed
|
||||
def check_deps(deps, check):
|
||||
for dep in filter(lambda d: len(d.strip()), deps):
|
||||
for dep in filter(lambda d: len(d.strip()) > 0, deps):
|
||||
if not check(dep.strip()):
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
|
@ -274,6 +274,9 @@ class LazyBaseModule:
|
|||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
|
||||
# set the name, display name and module factory
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
|
@ -286,8 +289,6 @@ class LazyBaseModule:
|
|||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
|
|
@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
|||
except (KeyboardInterrupt, Exception) as e:
|
||||
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
|
||||
# access the _instance here because loaded_module may not return if there's an error
|
||||
if lazy_module._instance and module_type == "extractor":
|
||||
lazy_module._instance.cleanup()
|
||||
raise e
|
||||
|
||||
if not loaded_module:
|
||||
|
|
|
@ -4,12 +4,6 @@ import argparse
|
|||
import json
|
||||
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
||||
return value
|
||||
|
||||
|
||||
def positive_number(value):
|
||||
if value < 0:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
},
|
||||
"session_file": {
|
||||
"default": "secrets/anon",
|
||||
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||
"help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
|
||||
},
|
||||
"join_channels": {
|
||||
"default": True,
|
||||
|
|
|
@ -1,4 +1,10 @@
|
|||
import os
|
||||
import shutil
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import date
|
||||
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors import ChannelInvalidError
|
||||
from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||
|
@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
|
|||
InviteRequestSentError,
|
||||
InviteHashExpiredError,
|
||||
)
|
||||
from loguru import logger
|
||||
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
|
|||
"""
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
|
||||
# in case the user already added '.session' to the session_file
|
||||
base_session_name = self.session_file.removesuffix(".session")
|
||||
base_session_filepath = f"{base_session_name}.session"
|
||||
|
||||
if self.session_file and not os.path.exists(base_session_filepath):
|
||||
logger.warning(
|
||||
f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
|
||||
)
|
||||
Path(base_session_filepath).touch()
|
||||
|
||||
# make a copy of the session that is used exclusively with this archiver instance
|
||||
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
|
||||
shutil.copy(self.session_file + ".session", new_session_file)
|
||||
self.session_file = new_session_file.replace(".session", "")
|
||||
self.session_file = os.path.join(
|
||||
os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
|
||||
)
|
||||
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
|
||||
shutil.copy(base_session_filepath, f"{self.session_file}.session")
|
||||
|
||||
# initiate the client
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
|
@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
|
|||
pbar.update()
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info(f"CLEANUP {self.name}.")
|
||||
session_file_name = self.session_file + ".session"
|
||||
logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
|
||||
session_file_name = f"{self.session_file}.session"
|
||||
if os.path.exists(session_file_name):
|
||||
os.remove(session_file_name)
|
||||
|
||||
|
|
|
@ -17,7 +17,24 @@ from auto_archiver.core.module import ModuleFactory
|
|||
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
|
||||
# what comes first will be run first (at the end of all other tests not mentioned)
|
||||
# format is the name of the module (python file) without the .py extension
|
||||
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
|
||||
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
|
||||
|
||||
|
||||
# don't check for ytdlp updates in tests
|
||||
@pytest.fixture(autouse=True)
|
||||
def skip_check_for_update(mocker):
|
||||
update_ytdlp = mocker.patch(
|
||||
"auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp"
|
||||
)
|
||||
update_ytdlp.return_value = False
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def get_lazy_module():
|
||||
def _get_lazy_module(module_name):
|
||||
return ModuleFactory().get_module_lazy(module_name)
|
||||
|
||||
return _get_lazy_module
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -134,6 +151,7 @@ def unpickle():
|
|||
|
||||
@pytest.fixture
|
||||
def mock_binary_dependencies(mocker):
|
||||
mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
|
||||
mock_shutil_which = mocker.patch("shutil.which")
|
||||
# Mock all binary dependencies as available
|
||||
mock_shutil_which.return_value = "/usr/bin/fake_binary"
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
from auto_archiver.core import Extractor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ExampleExtractor(Extractor):
|
||||
def download(self, item):
|
||||
print("download")
|
||||
logger.info("download")
|
||||
|
||||
def cleanup(self):
|
||||
logger.info("cleanup")
|
||||
|
|
|
@ -1,27 +1,29 @@
|
|||
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
def download(self, item):
|
||||
print("download")
|
||||
logger.info("download")
|
||||
|
||||
def __iter__(self):
|
||||
yield Metadata().set_url("https://example.com")
|
||||
|
||||
def done(self, result):
|
||||
print("done")
|
||||
logger.info("done")
|
||||
|
||||
def enrich(self, to_enrich):
|
||||
print("enrich")
|
||||
logger.info("enrich")
|
||||
|
||||
def get_cdn_url(self, media):
|
||||
return "nice_url"
|
||||
|
||||
def save(self, item):
|
||||
print("save")
|
||||
logger.info("save")
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
print("uploadf")
|
||||
logger.info("uploadf")
|
||||
|
||||
def format(self, item):
|
||||
print("format")
|
||||
logger.info("format")
|
||||
|
|
|
@ -37,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||
package = "auto_archiver.modules.generic_extractor"
|
||||
assert self.extractor.dropin_for_name("bluesky", package=package)
|
||||
|
||||
# test loading dropings via filepath
|
||||
# test loading dropins via filepath
|
||||
path = os.path.join(dirname(dirname(__file__)), "data/")
|
||||
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
|
||||
|
||||
|
@ -122,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
)
|
||||
assert len(result.media) == 2
|
||||
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
|
||||
assert "J---aiyznGQ" in Path(result.media[0].filename).name
|
||||
assert Path(result.media[1].filename).name == "hqdefault.jpg"
|
||||
|
||||
@pytest.mark.download
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
import os
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mock_client_setup(mocker):
|
||||
mocker.patch("telethon.client.auth.AuthMethods.start")
|
||||
|
||||
|
||||
def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
|
||||
start = mocker.patch("telethon.client.auth.AuthMethods.start")
|
||||
start.side_effect = Exception("Test exception")
|
||||
|
||||
# make sure the default setup file is created
|
||||
session_file = tmp_path / "test.session"
|
||||
|
||||
lazy_module = get_lazy_module("telethon_extractor")
|
||||
|
||||
with pytest.raises(Exception):
|
||||
lazy_module.load({"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}})
|
||||
|
||||
assert session_file.exists()
|
||||
assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file
|
||||
assert os.path.exists(lazy_module._instance.session_file + ".session")
|
|
@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
|
|||
with pytest.raises(SetupError) as err:
|
||||
orchestrator.setup(args)
|
||||
assert "Module 'example_extractor' is not a feeder" in str(err.value)
|
||||
|
||||
|
||||
def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
|
||||
orchestrator = ArchivingOrchestrator()
|
||||
|
||||
# hack to set up the paths so we can patch properly
|
||||
orchestrator.module_factory.setup_paths([TEST_MODULES])
|
||||
|
||||
# patch example_module.setup to throw an exception
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup",
|
||||
side_effect=Exception("Test exception"),
|
||||
)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
orchestrator.setup(test_args + ["--extractors", "example_extractor"])
|
||||
|
||||
assert "Error during setup of modules: Test exception" in caplog.text
|
||||
# make sure the 'cleanup' is called
|
||||
assert "cleanup" in caplog.text
|
||||
|
|
Ładowanie…
Reference in New Issue