Merge pull request #279 from bellingcat/telethon_tweaks

Fix calling extractor.cleanup (fixes telethon issue) + tidy up telethon extractor session file naming
pull/283/head
Patrick Robertson 2025-03-28 14:13:26 +04:00 zatwierdzone przez GitHub
commit 25f1f5dc93
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
11 zmienionych plików z 114 dodań i 30 usunięć

Wyświetl plik

@ -274,6 +274,9 @@ class LazyBaseModule:
# finally, get the class instance # finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
# save the instance for future easy loading
self._instance = instance
# set the name, display name and module factory # set the name, display name and module factory
instance.name = self.name instance.name = self.name
instance.display_name = self.display_name instance.display_name = self.display_name
@ -286,8 +289,6 @@ class LazyBaseModule:
instance.config_setup(config) instance.config_setup(config)
instance.setup() instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance return instance
def __repr__(self): def __repr__(self):

Wyświetl plik

@ -387,8 +387,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
except (KeyboardInterrupt, Exception) as e: except (KeyboardInterrupt, Exception) as e:
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError): if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if loaded_module and module_type == "extractor":
loaded_module.cleanup() # access the _instance here because loaded_module may not return if there's an error
if lazy_module._instance and module_type == "extractor":
lazy_module._instance.cleanup()
raise e raise e
if not loaded_module: if not loaded_module:

Wyświetl plik

@ -4,12 +4,6 @@ import argparse
import json import json
def example_validator(value):
    """Accept *value* only if it contains the substring ``"example"``.

    Args:
        value: The raw argument value to check (membership is tested with ``in``).

    Returns:
        ``value`` unchanged when it contains ``"example"``.

    Raises:
        argparse.ArgumentTypeError: If ``"example"`` is not found in ``value``.
    """
    if "example" in value:
        return value
    raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
def positive_number(value): def positive_number(value):
if value < 0: if value < 0:
raise argparse.ArgumentTypeError(f"{value} is not a positive number") raise argparse.ArgumentTypeError(f"{value} is not a positive number")

Wyświetl plik

@ -19,7 +19,7 @@
}, },
"session_file": { "session_file": {
"default": "secrets/anon", "default": "secrets/anon",
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.", "help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
}, },
"join_channels": { "join_channels": {
"default": True, "default": True,

Wyświetl plik

@ -1,4 +1,10 @@
import os
import shutil import shutil
import re
import time
from pathlib import Path
from datetime import date
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError from telethon.errors import ChannelInvalidError
from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.tl.functions.messages import ImportChatInviteRequest
@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
InviteRequestSentError, InviteRequestSentError,
InviteHashExpiredError, InviteHashExpiredError,
) )
from loguru import logger
from tqdm import tqdm from tqdm import tqdm
import re from loguru import logger
import time
import os
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
""" """
logger.info(f"SETUP {self.name} checking login...") logger.info(f"SETUP {self.name} checking login...")
# in case the user already added '.session' to the session_file
base_session_name = self.session_file.removesuffix(".session")
base_session_filepath = f"{base_session_name}.session"
if self.session_file and not os.path.exists(base_session_filepath):
logger.warning(
f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
)
Path(base_session_filepath).touch()
# make a copy of the session that is used exclusively with this archiver instance # make a copy of the session that is used exclusively with this archiver instance
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") self.session_file = os.path.join(
shutil.copy(self.session_file + ".session", new_session_file) os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
self.session_file = new_session_file.replace(".session", "") )
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
shutil.copy(base_session_filepath, f"{self.session_file}.session")
# initiate the client # initiate the client
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
pbar.update() pbar.update()
def cleanup(self) -> None: def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.") logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
session_file_name = self.session_file + ".session" session_file_name = f"{self.session_file}.session"
if os.path.exists(session_file_name): if os.path.exists(session_file_name):
os.remove(session_file_name) os.remove(session_file_name)

Wyświetl plik

@ -17,7 +17,24 @@ from auto_archiver.core.module import ModuleFactory
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
# what comes first will be run first (at the end of all other tests not mentioned) # what comes first will be run first (at the end of all other tests not mentioned)
# format is the name of the module (python file) without the .py extension # format is the name of the module (python file) without the .py extension
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
    """Autouse fixture that stubs out ``GenericExtractor.update_ytdlp`` during tests.

    The method is replaced with a mock whose return value is ``False``, so
    tests do not check for ytdlp updates.
    """
    target = "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_ytdlp"
    mocker.patch(target).return_value = False
@pytest.fixture
def get_lazy_module():
    """Provide a helper that fetches the lazy wrapper of a module by name.

    Returns:
        A callable taking ``module_name`` and returning
        ``ModuleFactory().get_module_lazy(module_name)``. A new
        ``ModuleFactory`` is constructed on every call.
    """

    def _lookup(module_name):
        factory = ModuleFactory()
        return factory.get_module_lazy(module_name)

    return _lookup
@pytest.fixture @pytest.fixture
@ -134,6 +151,7 @@ def unpickle():
@pytest.fixture @pytest.fixture
def mock_binary_dependencies(mocker): def mock_binary_dependencies(mocker):
mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
mock_shutil_which = mocker.patch("shutil.which") mock_shutil_which = mocker.patch("shutil.which")
# Mock all binary dependencies as available # Mock all binary dependencies as available
mock_shutil_which.return_value = "/usr/bin/fake_binary" mock_shutil_which.return_value = "/usr/bin/fake_binary"

Wyświetl plik

@ -1,6 +1,11 @@
from auto_archiver.core import Extractor from auto_archiver.core import Extractor
from loguru import logger
class ExampleExtractor(Extractor): class ExampleExtractor(Extractor):
def download(self, item): def download(self, item):
print("download") logger.info("download")
def cleanup(self):
logger.info("cleanup")

Wyświetl plik

@ -1,27 +1,29 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
from loguru import logger
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter): class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def download(self, item): def download(self, item):
print("download") logger.info("download")
def __iter__(self): def __iter__(self):
yield Metadata().set_url("https://example.com") yield Metadata().set_url("https://example.com")
def done(self, result): def done(self, result):
print("done") logger.info("done")
def enrich(self, to_enrich): def enrich(self, to_enrich):
print("enrich") logger.info("enrich")
def get_cdn_url(self, media): def get_cdn_url(self, media):
return "nice_url" return "nice_url"
def save(self, item): def save(self, item):
print("save") logger.info("save")
def uploadf(self, file, key, **kwargs): def uploadf(self, file, key, **kwargs):
print("uploadf") logger.info("uploadf")
def format(self, item): def format(self, item):
print("format") logger.info("format")

Wyświetl plik

@ -36,7 +36,7 @@ class TestGenericExtractor(TestExtractorBase):
package = "auto_archiver.modules.generic_extractor" package = "auto_archiver.modules.generic_extractor"
assert self.extractor.dropin_for_name("bluesky", package=package) assert self.extractor.dropin_for_name("bluesky", package=package)
# test loading dropings via filepath # test loading dropins via filepath
path = os.path.join(dirname(dirname(__file__)), "data/") path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path]) assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@ -121,7 +121,7 @@ class TestGenericExtractor(TestExtractorBase):
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/" == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
) )
assert len(result.media) == 2 assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm" assert "J---aiyznGQ" in Path(result.media[0].filename).name
assert Path(result.media[1].filename).name == "hqdefault.jpg" assert Path(result.media[1].filename).name == "hqdefault.jpg"
@pytest.mark.download @pytest.mark.download

Wyświetl plik

@ -0,0 +1,26 @@
import os
from datetime import date
import pytest
@pytest.fixture(autouse=True)
def mock_client_setup(mocker):
    """Autouse fixture that replaces telethon's ``AuthMethods.start`` with a mock."""
    start_target = "telethon.client.auth.AuthMethods.start"
    mocker.patch(start_target)
def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
    """Check session-file handling when telethon's ``start`` raises during module load.

    ``AuthMethods.start`` is patched to raise, the ``telethon_extractor`` lazy
    module is loaded with a session file path under ``tmp_path``, and the test
    then verifies that:

    * the configured session file exists,
    * the instance's ``session_file`` contains a ``telethon-<today>`` name,
    * that per-instance ``.session`` file exists on disk.
    """
    # force the client start to fail
    mocker.patch("telethon.client.auth.AuthMethods.start").side_effect = Exception("Test exception")

    # make sure the default setup file is created
    configured_session = tmp_path / "test.session"
    module_config = {
        "telethon_extractor": {"session_file": str(configured_session), "api_id": 123, "api_hash": "ABC"}
    }

    lazy_module = get_lazy_module("telethon_extractor")
    with pytest.raises(Exception):
        lazy_module.load(module_config)

    # NOTE(review): assertions are placed after the `with` block; inside it they
    # would be unreachable once load() raises -- confirm against the original file.
    assert configured_session.exists()

    instance_session = lazy_module._instance.session_file
    expected_fragment = f"telethon-{date.today().strftime('%Y-%m-%d')}"
    assert expected_fragment in instance_session
    assert os.path.exists(instance_session + ".session")

Wyświetl plik

@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
with pytest.raises(SetupError) as err: with pytest.raises(SetupError) as err:
orchestrator.setup(args) orchestrator.setup(args)
assert "Module 'example_extractor' is not a feeder" in str(err.value) assert "Module 'example_extractor' is not a feeder" in str(err.value)
def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
    """Check that a failure in an extractor's ``setup`` is logged and followed by cleanup.

    ``ExampleExtractor.setup`` is patched to raise; orchestrator setup is then
    expected to raise, and the captured log must contain both the setup error
    message and the text ``"cleanup"``.
    """
    setup_target = "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup"

    archiver = ArchivingOrchestrator()
    # hack to set up the paths so we can patch properly
    archiver.module_factory.setup_paths([TEST_MODULES])

    # patch the example extractor's setup to throw an exception
    mocker.patch(setup_target, side_effect=Exception("Test exception"))

    with pytest.raises(Exception):
        archiver.setup(test_args + ["--extractors", "example_extractor"])

    captured = caplog.text
    assert "Error during setup of modules: Test exception" in captured
    # make sure the 'cleanup' is called
    assert "cleanup" in captured