Merge branch 'main' into feat/yt-dlp-pots

pull/222/head
erinhmclark 2025-03-28 10:42:24 +00:00
commit b5dc1854a2
13 zmienionych plików z 122 dodań i 38 usunięć

Wyświetl plik

@ -22,7 +22,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out the repo
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
@ -33,14 +33,14 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
with:
images: bellingcat/auto-archiver

Wyświetl plik

@ -10,9 +10,6 @@ on:
workflows: ["Core Tests"]
types:
- completed
branches: [main]
paths:
- src/**
jobs:
tests:
@ -30,7 +27,10 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_branch || github.ref }}
# For PRs, use the head commit SHA from the triggering workflow
ref: ${{ github.event.workflow_run.head_sha || github.ref }}
# If PR is from a fork, we need fetch-depth: 0
fetch-depth: ${{ github.event.workflow_run.head_repository.fork && '0' || '1' }}
- name: Install poetry
run: pipx install poetry

Wyświetl plik

@ -214,7 +214,7 @@ class LazyBaseModule:
# check external dependencies are installed
def check_deps(deps, check):
for dep in filter(lambda d: len(d.strip()), deps):
for dep in filter(lambda d: len(d.strip()) > 0, deps):
if not check(dep.strip()):
logger.error(
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
@ -274,6 +274,9 @@ class LazyBaseModule:
# finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
# save the instance for future easy loading
self._instance = instance
# set the name, display name and module factory
instance.name = self.name
instance.display_name = self.display_name
@ -286,8 +289,6 @@ class LazyBaseModule:
instance.config_setup(config)
instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance
def __repr__(self):

Wyświetl plik

@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
except (KeyboardInterrupt, Exception) as e:
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if loaded_module and module_type == "extractor":
loaded_module.cleanup()
# access lazy_module._instance here because loaded_module may never have been assigned if loading raised an error
if lazy_module._instance and module_type == "extractor":
lazy_module._instance.cleanup()
raise e
if not loaded_module:

Wyświetl plik

@ -4,12 +4,6 @@ import argparse
import json
def example_validator(value):
if "example" not in value:
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
return value
def positive_number(value):
if value < 0:
raise argparse.ArgumentTypeError(f"{value} is not a positive number")

Wyświetl plik

@ -19,7 +19,7 @@
},
"session_file": {
"default": "secrets/anon",
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
"help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
},
"join_channels": {
"default": True,

Wyświetl plik

@ -1,4 +1,10 @@
import os
import shutil
import re
import time
from pathlib import Path
from datetime import date
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from telethon.tl.functions.messages import ImportChatInviteRequest
@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
InviteRequestSentError,
InviteHashExpiredError,
)
from loguru import logger
from tqdm import tqdm
import re
import time
import os
from loguru import logger
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
"""
logger.info(f"SETUP {self.name} checking login...")
# in case the user already added '.session' to the session_file
base_session_name = self.session_file.removesuffix(".session")
base_session_filepath = f"{base_session_name}.session"
if self.session_file and not os.path.exists(base_session_filepath):
logger.warning(
f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
)
Path(base_session_filepath).touch()
# make a copy of the session that is used exclusively with this archiver instance
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
shutil.copy(self.session_file + ".session", new_session_file)
self.session_file = new_session_file.replace(".session", "")
self.session_file = os.path.join(
os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
)
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
shutil.copy(base_session_filepath, f"{self.session_file}.session")
# initiate the client
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
pbar.update()
def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.")
session_file_name = self.session_file + ".session"
logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
session_file_name = f"{self.session_file}.session"
if os.path.exists(session_file_name):
os.remove(session_file_name)

Wyświetl plik

@ -17,7 +17,24 @@ from auto_archiver.core.module import ModuleFactory
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
# what comes first will be run first (at the end of all other tests not mentioned)
# format is the name of the module (python file) without the .py extension
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
    """Replace ``GenericExtractor.update_ytdlp`` with a mock for every test.

    The mock always returns ``False``. Being an autouse fixture, it is
    applied to each test without having to be requested explicitly.
    """
    mocker.patch(
        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp",
        return_value=False,
    )
@pytest.fixture
def get_lazy_module():
    """Provide a callable that looks up a lazy module by name.

    Each call creates a ``ModuleFactory()`` and returns the result of its
    ``get_module_lazy(module_name)``.
    """

    def _lookup(module_name):
        factory = ModuleFactory()
        return factory.get_module_lazy(module_name)

    return _lookup
@pytest.fixture
@ -134,6 +151,7 @@ def unpickle():
@pytest.fixture
def mock_binary_dependencies(mocker):
mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
mock_shutil_which = mocker.patch("shutil.which")
# Mock all binary dependencies as available
mock_shutil_which.return_value = "/usr/bin/fake_binary"

Wyświetl plik

@ -1,6 +1,11 @@
from auto_archiver.core import Extractor
from loguru import logger
class ExampleExtractor(Extractor):
def download(self, item):
print("download")
logger.info("download")
def cleanup(self):
logger.info("cleanup")

Wyświetl plik

@ -1,27 +1,29 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
from loguru import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def download(self, item):
print("download")
logger.info("download")
def __iter__(self):
yield Metadata().set_url("https://example.com")
def done(self, result):
print("done")
logger.info("done")
def enrich(self, to_enrich):
print("enrich")
logger.info("enrich")
def get_cdn_url(self, media):
return "nice_url"
def save(self, item):
print("save")
logger.info("save")
def uploadf(self, file, key, **kwargs):
print("uploadf")
logger.info("uploadf")
def format(self, item):
print("format")
logger.info("format")

Wyświetl plik

@ -37,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase):
package = "auto_archiver.modules.generic_extractor"
assert self.extractor.dropin_for_name("bluesky", package=package)
# test loading dropings via filepath
# test loading dropins via filepath
path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@ -122,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase):
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
)
assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
assert "J---aiyznGQ" in Path(result.media[0].filename).name
assert Path(result.media[1].filename).name == "hqdefault.jpg"
@pytest.mark.download

Wyświetl plik

@ -0,0 +1,26 @@
import os
from datetime import date
import pytest
@pytest.fixture(autouse=True)
def mock_client_setup(mocker):
    """Patch ``AuthMethods.start`` from telethon with a mock for every test in this module."""
    start_target = "telethon.client.auth.AuthMethods.start"
    mocker.patch(start_target)
def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
    """Check the session files on disk after loading the telethon extractor fails.

    ``AuthMethods.start`` is patched to raise, so ``load`` is expected to
    fail. Afterwards the configured session path must exist, and the
    instance's ``session_file`` must contain ``telethon-<today>`` and have a
    matching ``.session`` file on disk.
    """
    # make the patched client start() raise
    mocker.patch(
        "telethon.client.auth.AuthMethods.start",
        side_effect=Exception("Test exception"),
    )

    # this path is never created by the test itself; it is asserted to exist below
    session_file = tmp_path / "test.session"
    config = {"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}}

    lazy_module = get_lazy_module("telethon_extractor")
    with pytest.raises(Exception):
        lazy_module.load(config)

    # assertions live outside the `with` block: statements placed after the
    # raising call inside it would never execute
    assert session_file.exists()

    copied_session = lazy_module._instance.session_file
    todays_prefix = f"telethon-{date.today().strftime('%Y-%m-%d')}"
    assert todays_prefix in copied_session
    assert os.path.exists(copied_session + ".session")

Wyświetl plik

@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
with pytest.raises(SetupError) as err:
orchestrator.setup(args)
assert "Module 'example_extractor' is not a feeder" in str(err.value)
def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
    """Check that a setup failure in an extractor is logged and followed by a cleanup log entry.

    ``ExampleExtractor.setup`` is patched to raise, so ``orchestrator.setup``
    is expected to fail. The captured log must then contain both the
    setup-error message and the text "cleanup".
    """
    orchestrator = ArchivingOrchestrator()
    # hack: set up the test module paths first so the patch below can be applied properly
    orchestrator.module_factory.setup_paths([TEST_MODULES])

    # make ExampleExtractor.setup raise
    mocker.patch(
        "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup",
        side_effect=Exception("Test exception"),
    )

    cli_args = test_args + ["--extractors", "example_extractor"]
    with pytest.raises(Exception):
        orchestrator.setup(cli_args)

    captured = caplog.text
    assert "Error during setup of modules: Test exception" in captured
    # make sure 'cleanup' was logged
    assert "cleanup" in captured