From 52542812dcbd171f1606a4f7502becb1101bd570 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Feb 2025 16:42:58 +0000 Subject: [PATCH 01/17] Merge tests from version with context. --- .../modules/gsheet_db/gsheet_db.py | 15 +- .../instagram_tbot_extractor.py | 80 ++++-- .../modules/telethon_extractor/__init__.py | 2 +- .../telethon_extractor/telethon_extractor.py | 2 +- tests/conftest.py | 19 +- tests/databases/test_gsheet_db.py | 140 +++++++++ .../test_instagram_api_extractor.py | 108 +++++++ .../test_instagram_tbot_extractor.py | 111 ++++++++ tests/feeders/test_gsheet_feeder.py | 268 ++++++++++++++++++ tests/feeders/test_gworksheet.py | 144 ++++++++++ tests/storages/test_S3_storage.py | 100 +++++++ tests/storages/test_gdrive_storage.py | 43 +++ tests/storages/test_storage_base.py | 23 ++ 13 files changed, 1022 insertions(+), 33 deletions(-) create mode 100644 tests/databases/test_gsheet_db.py create mode 100644 tests/extractors/test_instagram_api_extractor.py create mode 100644 tests/extractors/test_instagram_tbot_extractor.py create mode 100644 tests/feeders/test_gsheet_feeder.py create mode 100644 tests/feeders/test_gworksheet.py create mode 100644 tests/storages/test_S3_storage.py create mode 100644 tests/storages/test_gdrive_storage.py create mode 100644 tests/storages/test_storage_base.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 5e1ed1e..644015e 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -12,10 +12,11 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. 
+ could be updated in the future to support non-GsheetFeeder metadata """ + def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) @@ -57,7 +58,7 @@ class GsheetsDb(Database): media: Media = item.get_final_media() if hasattr(media, "urls"): batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) + batch_if_valid('date', True, self._get_current_datetime_iso()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")) batch_if_valid('timestamp', item.get_timestamp()) @@ -85,6 +86,12 @@ class GsheetsDb(Database): gw.batch_set_cell(cell_updates) + @staticmethod + def _get_current_datetime_iso() -> str: + """Helper method to generate the current datetime in ISO format.""" + return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) @@ -93,9 +100,11 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") + # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 5b49484..5660cd2 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -34,19 +34,30 @@ class InstagramTbotExtractor(Extractor): """ super().setup(configs) logger.info(f"SETUP {self.name} checking login...") + self._prepare_session_file() + self._initialize_telegram_client() - # make a copy of the session that is used exclusively with this archiver instance + def _prepare_session_file(self): + """ + Creates a copy of the session file for exclusive use with this archiver instance. + Ensures that a valid session file exists before proceeding. + """ new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") if not os.path.exists(f"{self.session_file}.session"): - raise FileNotFoundError(f"session file {self.session_file}.session not found, " - f"to set this up run the setup script in scripts/telegram_setup.py") + raise FileNotFoundError(f"Session file {self.session_file}.session not found.") shutil.copy(self.session_file + ".session", new_session_file) self.session_file = new_session_file.replace(".session", "") + def _initialize_telegram_client(self): + """Initializes the Telegram client.""" try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: - logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") + logger.error( + f"Unable to access the {self.session_file} session. " + "Ensure that you don't use the same session file here and in telethon_extractor. 
" + "If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}" + ) with self.client.start(): logger.success(f"SETUP {self.name} login works.") @@ -63,32 +74,49 @@ class InstagramTbotExtractor(Extractor): result = Metadata() tmp_dir = self.tmp_dir with self.client.start(): - chat = self.client.get_entity("instagram_load_bot") - since_id = self.client.send_message(entity=chat, message=url).id - attempts = 0 - seen_media = [] - message = "" - time.sleep(3) - # media is added before text by the bot so it can be used as a stop-logic mechanism - while attempts < (self.timeout - 3) and (not message or not len(seen_media)): - attempts += 1 - time.sleep(1) - for post in self.client.iter_messages(chat, min_id=since_id): - since_id = max(since_id, post.id) - if post.media and post.id not in seen_media: - filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') - media = self.client.download_media(post.media, filename_dest) - if media: - result.add_media(Media(media)) - seen_media.append(post.id) - if post.message: message += post.message + chat, since_id = self._send_url_to_bot(url) + message = self._process_messages(chat, since_id, tmp_dir, result) - if "You must enter a URL to a post" in message: + if "You must enter a URL to a post" in message: logger.debug(f"invalid link {url=} for {self.name}: {message}") return False - + # # TODO: It currently returns this as a success - is that intentional? + # if "Media not found or unavailable" in message: + # logger.debug(f"invalid link {url=} for {self.name}: {message}") + # return False + if message: result.set_content(message).set_title(message[:128]) - return result.success("insta-via-bot") + + def _send_url_to_bot(self, url: str): + """ + Sends the URL to the 'instagram_load_bot' and returns (chat, since_id). 
+ """ + chat = self.client.get_entity("instagram_load_bot") + since_message = self.client.send_message(entity=chat, message=url) + return chat, since_message.id + + def _process_messages(self, chat, since_id, tmp_dir, result): + attempts = 0 + seen_media = [] + message = "" + time.sleep(3) + # media is added before text by the bot so it can be used as a stop-logic mechanism + while attempts < (self.timeout - 3) and (not message or not len(seen_media)): + attempts += 1 + time.sleep(1) + for post in self.client.iter_messages(chat, min_id=since_id): + since_id = max(since_id, post.id) + # Skip known filler message: + if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi': + continue + if post.media and post.id not in seen_media: + filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') + media = self.client.download_media(post.media, filename_dest) + if media: + result.add_media(Media(media)) + seen_media.append(post.id) + if post.message: message += post.message + return message.strip() \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index a837fdf..2eaa57c 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_extractor import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3e952e8..0147ff2 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -13,7 +13,7 @@ from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str -class TelethonArchiver(Extractor): +class TelethonExtractor(Extractor): valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/tests/conftest.py b/tests/conftest.py index f909bfb..8675fbc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,8 @@ """ pytest conftest file, for shared fixtures and configuration """ - +import os +import pickle from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib @@ -113,4 +114,18 @@ def pytest_runtest_setup(item): test_name = _test_failed_incremental[cls_name].get((), None) # if name found, test has failed for the combination of class name & test name if test_name is not None: - pytest.xfail(f"previous test failed ({test_name})") \ No newline at end of file + pytest.xfail(f"previous test failed ({test_name})") + + + +@pytest.fixture() +def unpickle(): + """ + Returns a helper function that unpickles a file + ** gets the file from the test_files directory: tests/data/test_files ** + """ + def _unpickle(path): + test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") + with open(os.path.join(test_data_dir, path), "rb") as f: + return pickle.load(f) + return _unpickle \ No newline at end of file diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py new file mode 100644 index 0000000..bdc2811 --- /dev/null +++ b/tests/databases/test_gsheet_db.py @@ -0,0 +1,140 @@ +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +import pytest + 
+from auto_archiver.core import Metadata, Media +from auto_archiver.modules.gsheet_db import GsheetsDb +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +@pytest.fixture +def mock_gworksheet(): + mock_gworksheet = MagicMock(spec=GWorksheet) + mock_gworksheet.col_exists.return_value = True + mock_gworksheet.get_cell.return_value = "" + mock_gworksheet.get_row.return_value = {} + return mock_gworksheet + + +@pytest.fixture +def mock_metadata(): + metadata: Metadata = MagicMock(spec=Metadata) + metadata.get_url.return_value = "http://example.com" + metadata.status = "done" + metadata.get_title.return_value = "Example Title" + metadata.get.return_value = "Example Content" + metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z" + metadata.get_final_media.return_value = MagicMock(spec=Media) + metadata.get_all_media.return_value = [] + metadata.get_media_by_id.return_value = None + metadata.get_first_image.return_value = None + return metadata + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) + metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) + metadata.set_url("http://example.com") + metadata.set_title("Example Title") + metadata.set_content("Example Content") + metadata.success("my-archiver") + metadata.set("timestamp", "2025-01-01T00:00:00Z") + metadata.set("date", "2025-02-04T18:22:24.909112+00:00") + return metadata + + +@pytest.fixture +def mock_media(): + """Fixture for a mock Media object.""" + mock_media = MagicMock(spec=Media) + mock_media.urls = ["http://example.com/media"] + mock_media.get.return_value = "not-calculated" + return mock_media + +@pytest.fixture +def gsheets_db(mock_gworksheet, setup_module): + db = setup_module("gsheet_db", { + "allow_worksheets": "set()", + "block_worksheets": "set()", + "use_sheet_names_in_stored_paths": "True", + }) + db._retrieve_gsheet = MagicMock(return_value=(mock_gworksheet, 1)) + return db + + +@pytest.fixture +def fixed_timestamp(): + """Fixture for a fixed timestamp.""" + return datetime(2025, 1, 1, tzinfo=timezone.utc) + + +@pytest.fixture +def expected_calls(mock_media, fixed_timestamp): + """Fixture for the expected cell updates.""" + return [ + (1, 'status', 'my-archiver: success'), + (1, 'archive', 'http://example.com/screenshot.png'), + (1, 'date', '2025-02-01T00:00:00+00:00'), + (1, 'title', 'Example Title'), + (1, 'text', 'Example Content'), + (1, 'timestamp', '2025-01-01T00:00:00+00:00'), + (1, 'hash', 'not-calculated'), + # (1, 'screenshot', 'http://example.com/screenshot.png'), + # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), + # (1, 'wacz', 'http://example.com/browsertrix.wacz'), + # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') + ] + +def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): + gw, row = gsheets_db._retrieve_gsheet(metadata) + assert gw == mock_gworksheet + assert row == 1 + + +def test_started(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.started(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress') + +def test_failed(gsheets_db, mock_metadata, mock_gworksheet): + reason = "Test failure" + gsheets_db.failed(mock_metadata, reason) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 
f'Archive failed {reason}') + +def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.aborted(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') + + +def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) + + +def test_done_cached(gsheets_db, metadata, mock_gworksheet): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata, cached=True) + + # Verify the status message includes "[cached]" + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + assert any(call[2].startswith("[cached]") for call in call_args) + + +def test_done_missing_media(gsheets_db, metadata, mock_gworksheet): + # clear media from metadata + metadata.media = [] + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + # Verify nothing media-related gets updated + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'} + assert all(call[1] not in media_fields for call in call_args) + +def test_safe_status_update(gsheets_db, metadata, mock_gworksheet): + gsheets_db._safe_status_update(metadata, "Test status") + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status') + + diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py new file mode 100644 index 0000000..7a19233 --- /dev/null +++ b/tests/extractors/test_instagram_api_extractor.py @@ -0,0 +1,108 @@ +from datetime import datetime +from typing import Type + +import pytest +from unittest.mock import patch, MagicMock + +from auto_archiver.core import Metadata +from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor import InstagramAPIExtractor +from .test_extractor_base import TestExtractorBase + + +@pytest.fixture +def mock_user_response(): + return { + "user": { + "pk": "123", + "username": "test_user", + "full_name": "Test User", + "profile_pic_url_hd": "http://example.com/profile.jpg", + "profile_pic_url": "http://example.com/profile_lowres.jpg" + } + } + +@pytest.fixture +def mock_post_response(): + return { + "id": "post_123", + "code": "abc123", + "caption_text": "Test Caption", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/video.mp4", + "thumbnail_url": "http://example.com/thumbnail.jpg" + } + +@pytest.fixture +def mock_story_response(): + return [{ + "id": "story_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/story.mp4" + }] + +@pytest.fixture +def mock_highlight_response(): + return { + "response": { + "reels": { + "highlight:123": { + "id": "123", + "title": "Test Highlight", + "items": [{ + "id": "item_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/highlight.mp4" + }] + } + } + } + } + + +# @pytest.mark.incremental +class TestInstagramAPIExtractor(TestExtractorBase): + """ + Test suite for InstagramAPIExtractor. 
+ """ + + extractor_module = "instagram_api_extractor" + extractor: InstagramAPIExtractor + + config = { + "access_token": "test_access_token", + "api_endpoint": "https://api.instagram.com/v1", + # "full_profile": False, + # "full_profile_max_posts": 0, + # "minimize_json_output": True, + } + + @pytest.mark.parametrize("url,expected", [ + ("https://instagram.com/user", [("", "user", "")]), + ("https://instagr.am/p/post_id", []), + ("https://youtube.com", []), + ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]), + ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]), + ("https://instagram.com/stories/user/123", [("stories", "user", "123")]), + ]) + def test_url_parsing(self, url, expected): + assert self.extractor.valid_url.findall(url) == expected + + def test_initialize(self): + self.extractor.initialise() + assert self.extractor.api_endpoint[-1] != "/" + + @pytest.mark.parametrize("input_dict,expected", [ + ({"x": 0, "valid": "data"}, {"valid": "data"}), + ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}), + ]) + def test_cleanup_dict(self, input_dict, expected): + assert self.extractor.cleanup_dict(input_dict) == expected + + def test_download_post(self): + # test with context=reel + # test with context=post + # test with multiple images + # test gets text (metadata title) + + + pass \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py new file mode 100644 index 0000000..4fe80be --- /dev/null +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -0,0 +1,111 @@ +import os +import pickle +from typing import Type +from unittest.mock import patch, MagicMock + +import pytest + +from auto_archiver.core.extractor import Extractor +from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor + + +TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") + + +@pytest.fixture +def test_session_file(tmpdir): + """Fixture to create a test session file.""" + session_file = os.path.join(tmpdir, "test_session.session") + with open(session_file, "w") as f: + f.write("mock_session_data") + return session_file.replace(".session", "") + + +@pytest.mark.incremental +class TestInstagramTbotExtractor(object): + """ + Test suite for InstagramTbotExtractor. 
+ """ + + extractor_module = "instagram_tbot_extractor" + extractor: InstagramTbotExtractor + config = { + "api_id": 12345, + "api_hash": "test_api_hash", + # "session_file" + } + + @pytest.fixture(autouse=True) + def setup_extractor(self, setup_module): + assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) + return extractor + + @pytest.fixture + def mock_telegram_client(self): + """Fixture to mock TelegramClient interactions.""" + with patch("auto_archiver.modules.instagram_tbot_extractor._initialize_telegram_client") as mock_client: + instance = MagicMock() + mock_client.return_value = instance + yield instance + + + # @pytest.fixture + # def mock_session_file(self, temp_session_file): + # """Patch the extractor’s session file setup to use a temporary path.""" + # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): + # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): + # yield # Mocks are applied for the duration of the test + + @pytest.fixture + def metadata_sample(self): + """Loads a Metadata object from a pickle file.""" + with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: + return pickle.load(f) + + + @pytest.mark.download + @pytest.mark.parametrize("url, expected_status, bot_responses", [ + ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), + ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. 
Stephanie Ladel is one such vol")]), + # todo tbot not working for stories :( + ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, [MagicMock(id=101, media=None, message="Media not found or unavailable")]), + ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), + ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), + ]) + def test_download(self, url, expected_status, bot_responses, metadata_sample): + """Test the `download()` method with various Instagram URLs.""" + metadata_sample.set_url(url) + self.extractor.initialise() + result = self.extractor.download(metadata_sample) + if expected_status: + assert result.is_success() + assert result.status == expected_status + assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + else: + assert result is False + # self.extractor.cleanup() + + # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') + # @patch.object(InstagramTbotExtractor, '_process_messages') + # def test_download_invalid_link_returns_false( + # self, mock_process, mock_send, extractor, metadata_instagram + # ): + # # Setup Mocks + # # _send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) + # mock_chat = MagicMock() + # mock_send.return_value = (mock_chat, 100) + # # _process_messages -> simulate it returns the text "You must enter a URL to a post" + # mock_process.return_value = "You must enter a URL to a post" + # result = extractor.download(metadata_instagram) + # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + + + + + # Test story +# Test expired story +# Test requires login/ access (?) +# Test post +# Test multiple images? \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py new file mode 100644 index 0000000..dbd2416 --- /dev/null +++ b/tests/feeders/test_gsheet_feeder.py @@ -0,0 +1,268 @@ +from typing import Type + +import gspread +import pytest +from unittest.mock import patch, MagicMock +from auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.core import Metadata, Feeder, ArchivingContext + + +def test_initialise_without_sheet_and_sheet_id(setup_module): + """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
+ (shouldn't really be asserting in there) + """ + with patch("gspread.service_account"): + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": None}) + with pytest.raises(AssertionError): + feeder.initialise() + + +@pytest.fixture +def gsheet_feeder(setup_module) -> GsheetsFeeder: + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + ) + feeder.gsheets_client = MagicMock() + return feeder + + +@pytest.fixture() +def worksheet(unpickle): + # Load the worksheet data from the pickle file + # only works for simple usage, cant reauthenticate but give structure + return unpickle("test_worksheet.pickle") + + +class TestWorksheet(): + """ + mimics the bits we need from gworksheet + """ + + class SheetSheet: + title = "TestSheet" + + rows = [ + { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 4, "url": "", "status": "", "folder": "" }, + { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, + { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, + ] + + def __init__(self): + self.wks = self.SheetSheet() + + def count_rows(self): + if not self.rows: + return 0 + return max(r["row"] for r in self.rows) + + def get_cell(self, row, col_name, fresh=False): + matching = next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, "") + + def get_cell_or_default(self, row, col_name, default): + matching = next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, default) + +def test__process_rows(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + metadata_items = list(gsheet_feeder._process_rows(testworksheet)) + assert len(metadata_items) == 3 + assert isinstance(metadata_items[0], Metadata) + assert metadata_items[0].get("url") == "http://example.com" + +def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 1) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +@pytest.mark.skip(reason="Not recognising folder column") +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 7) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + testworksheet.wks.title = "TestSheet" + gsheet_feeder._set_context(testworksheet, 6) + assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} + assert ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + + +@pytest.mark.usefixtures("setup_module") +@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ + ("TestSheet", None, "open", 
"TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") +]) +def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): + """Ensure open_sheet() correctly opens by name or ID based on configuration.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + mock_client.open.return_value = "MockSheet" + mock_client.open_by_key.return_value = "MockSheet" + + # Setup module with parameterized values + feeder = setup_module("gsheet_feeder", { + "service_account": "dummy.json", + "sheet": sheet, + "sheet_id": sheet_id + }) + feeder.initialise() + sheet_result = feeder.open_sheet() + # Validate the correct method was called + getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + assert sheet_result == "MockSheet", f"Failed: {description}" + + +@pytest.mark.usefixtures("setup_module") +def test_open_sheet_with_sheet_id(setup_module): + """Ensure open_sheet() correctly opens a sheet by ID.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + mock_client.open_by_key.return_value = "MockSheet" + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": "ABC123"}) + feeder.initialise() + sheet = feeder.open_sheet() + mock_client.open_by_key.assert_called_once_with("ABC123") + assert sheet == "MockSheet" + + +def test_should_process_sheet(setup_module): + gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}} + ) + assert gdb.should_process_sheet("TestSheet") == True + assert gdb.should_process_sheet("Sheet3") == False + # False if allow_worksheets is set + assert gdb.should_process_sheet("AnotherSheet") == False + + + +@pytest.mark.skip +class TestGSheetsFeederReal: + + """ Testing GSheetsFeeder class """ + module_name: str = 'gsheet_feeder' + feeder: GsheetsFeeder + config: dict = { + # TODO: Create test creds + "service_account": "secrets/service_account.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + + @pytest.fixture(autouse=True) + def setup_feeder(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.feeder: Type[Feeder] = setup_module( + self.module_name, self.config + ) + + def reset_test_sheet(self): + """Clears test sheet and re-adds headers to ensure consistent test results.""" + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + worksheet.clear() + worksheet.append_row(["Link", "Archive 
Status"]) + + def test_initialise(self): + self.feeder.initialise() + assert hasattr(self.feeder, "gsheets_client") + + @pytest.mark.download + def test_open_sheet_real_connection(self): + """Ensure open_sheet() connects to a real Google Sheets instance.""" + self.feeder.initialise() + sheet = self.feeder.open_sheet() + assert sheet is not None, "open_sheet() should return a valid sheet instance" + assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + + @pytest.mark.download + def test_iter_yields_metadata_real_data(self): + """Ensure __iter__() yields Metadata objects for real test sheet data.""" + self.reset_test_sheet() + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + # Insert test rows as a temp method + # Next we will refactor the feeder for better testing + test_rows = [ + ["https://example.com", ""], + ["", ""], + ["https://example.com", "done"], + ] + worksheet.append_rows(test_rows) + self.feeder.initialise() + metadata_list = list(self.feeder) + + # Validate that only the first row is processed + assert len(metadata_list) == 1 + assert metadata_list[0].metadata.get("url") == "https://example.com" + + + +# TODO + +# Test two sheets +# test two sheets with different columns +# test folder implementation diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py new file mode 100644 index 0000000..e6f5cc6 --- /dev/null +++ b/tests/feeders/test_gworksheet.py @@ -0,0 +1,144 @@ +import pytest +from unittest.mock import MagicMock + +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +class TestGWorksheet: + @pytest.fixture + def mock_worksheet(self): + mock_ws = MagicMock() + mock_ws.get_values.return_value = [ + ["Link", "Archive Status", "Archive Location", "Archive Date"], + ["url1", "archived", "filepath1", "2023-01-01"], + ["url2", "pending", "filepath2", "2023-01-02"], + ] + return mock_ws + + @pytest.fixture + def gworksheet(self, mock_worksheet): + return GWorksheet(mock_worksheet) + + # Test initialization and basic properties + def test_initialization_sets_headers(self, gworksheet): + assert gworksheet.headers == ["link", "archive status", "archive location", "archive date"] + + def test_count_rows_returns_correct_value(self, gworksheet): + # inc header row + assert gworksheet.count_rows() == 3 + + # Test column validation and lookup + @pytest.mark.parametrize( + "col,expected_index", + [ + ("url", 0), + ("status", 1), + ("archive", 2), + ("date", 3), + ], + ) + def test_col_index_returns_correct_index(self, gworksheet, col, expected_index): + assert gworksheet._col_index(col) == expected_index + + def test_check_col_exists_raises_for_invalid_column(self, gworksheet): + with pytest.raises(Exception, match="Column invalid_col"): + gworksheet._check_col_exists("invalid_col") + + # Test data retrieval + @pytest.mark.parametrize( + "row,expected", + [ + (1, ["Link", "Archive Status", "Archive Location", "Archive Date"]), + (2, ["url1", "archived", "filepath1", "2023-01-01"]), + (3, ["url2", "pending", "filepath2", "2023-01-02"]), + ], + ) + def test_get_row_returns_correct_data(self, gworksheet, row, expected): + assert gworksheet.get_row(row) == expected + + @pytest.mark.parametrize( + "row,col,expected", + [ + (2, "url", "url1"), + (2, "status", "archived"), + (3, "date", "2023-01-02"), + ], + ) + def test_get_cell_returns_correct_value(self, gworksheet, row, col, expected): + assert gworksheet.get_cell(row, col) 
== expected
+
+    def test_get_cell_handles_fresh_data(self, mock_worksheet, gworksheet):
+        mock_worksheet.cell.return_value.value = "fresh_value"
+        result = gworksheet.get_cell(2, "url", fresh=True)
+        assert result == "fresh_value"
+        mock_worksheet.cell.assert_called_once_with(2, 1)
+
+    # Test edge cases and error handling
+    @pytest.mark.parametrize(
+        "when_empty,expected",
+        [
+            (True, "default"),
+            (False, ""),
+        ],
+    )
+    def test_get_cell_or_default_handles_empty_values(
+        self, mock_worksheet, when_empty, expected
+    ):
+        mock_worksheet.get_values.return_value[1][0] = ""  # Empty URL cell
+        g = GWorksheet(mock_worksheet)
+        assert (
+            g.get_cell_or_default(
+                2, "url", default="default", when_empty_use_default=when_empty
+            )
+            == expected
+        )
+
+    def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
+        assert (
+            gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
+        )
+
+    # Test write operations
+    def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
+        gworksheet.set_cell(2, "url", "new_url")
+        mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")
+
+    def test_batch_set_cell_formats_requests_correctly(
+        self, mock_worksheet, gworksheet
+    ):
+        updates = [(2, "url", "new_url"), (3, "status", "processed")]
+        gworksheet.batch_set_cell(updates)
+        expected_batch = [
+            {"range": "A2", "values": [["new_url"]]},
+            {"range": "B3", "values": [["processed"]]},
+        ]
+        mock_worksheet.batch_update.assert_called_once_with(
+            expected_batch, value_input_option="USER_ENTERED"
+        )
+
+    def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
+        long_value = "x" * 50000
+        gworksheet.batch_set_cell([(1, "url", long_value)])
+        submitted_value = mock_worksheet.batch_update.call_args[0][0][0]["values"][0][0]
+        assert len(submitted_value) == 49999
+
+    # Test coordinate conversion
+    @pytest.mark.parametrize(
+        "row,col,expected",
+        [
+            (1, "url", "A1"),
+            (2, "status", "B2"),
+            (3, "archive", "C3"),
+            (4, "date", "D4"),
+        ],
+    )
+    def test_to_a1_conversion(self, gworksheet, row, col, expected):
+        assert gworksheet.to_a1(row, col) == expected
+
+    # Test empty worksheet
+    def test_empty_worksheet_initialization(self):
+        mock_ws = MagicMock()
+        mock_ws.get_values.return_value = []
+        g = GWorksheet(mock_ws)
+        assert g.headers == []
+        assert g.count_rows() == 0
diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py
new file mode 100644
index 0000000..df1c1f1
--- /dev/null
+++ b/tests/storages/test_S3_storage.py
@@ -0,0 +1,100 @@
+from typing import Type
+import pytest
+from unittest.mock import MagicMock, patch, mock_open
+from auto_archiver.core import Media
+from auto_archiver.modules.s3_storage import s3_storage
+from tests.storages.test_storage_base import TestStorageBase
+
+
+class TestS3Storage:
+    """
+    Test suite for S3Storage.
+ """ + module_name: str = "s3_storage" + storage: Type[s3_storage] + s3: MagicMock + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + + @patch('boto3.client') + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + self.storage = setup_module(self.module_name, self.config) + self.storage.initialise() + + @patch('boto3.client') + def test_client_initialization(self, mock_boto_client, setup_module): + """Test that S3 client is initialized with correct parameters""" + self.storage.initialise() + mock_boto_client.assert_called_once_with( + 's3', + region_name='test-region', + endpoint_url='https://test-region.example.com', + aws_access_key_id='test-key', + aws_secret_access_key='test-secret' + ) + + def test_get_cdn_url_generation(self): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = self.storage.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + + @patch.object(s3_storage.S3Storage, 'file_in_folder') + def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + self.storage.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify upload + assert self.storage.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + result = self.storage.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + + @patch.object(s3_storage.S3Storage, 'is_upload_needed') + def test_uploads_with_correct_parameters(self, mock_upload_needed): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + self.storage.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) \ No newline at end of file diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py new file mode 100644 index 0000000..b7417ad --- /dev/null +++ b/tests/storages/test_gdrive_storage.py @@ -0,0 +1,43 @@ +from typing import Type +import pytest +from unittest.mock import MagicMock, patch +from auto_archiver.core import Media +from auto_archiver.modules.gdrive_storage import GDriveStorage +from auto_archiver.core.metadata import Metadata +from tests.storages.test_storage_base import TestStorageBase + + +class TestGDriveStorage(TestStorageBase): + """ + Test suite for 
GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + 'root_folder_id': "fake_root_folder_id", + 'oauth_token': None, + 'service_account': 'fake_service_account.json' + } + + @pytest.mark.skip(reason="Requires real credentials") + @pytest.mark.download + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + self.storage.service_account = 'secrets/service_account.json' # Path to real credentials + self.storage.initialise() + assert self.storage.service is not None + + + def test_initialize_fails_with_non_existent_creds(self): + """ + Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist. + """ + # Act and Assert + with pytest.raises(FileNotFoundError) as exc_info: + self.storage.initialise() + assert "No such file or directory" in str(exc_info.value) + diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py new file mode 100644 index 0000000..50d8846 --- /dev/null +++ b/tests/storages/test_storage_base.py @@ -0,0 +1,23 @@ +from typing import Type + +import pytest + +from auto_archiver.core.context import ArchivingContext +from auto_archiver.core.metadata import Metadata +from auto_archiver.core.storage import Storage + + +class TestStorageBase(object): + + module_name: str = None + config: dict = None + + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.storage: Type[Storage] = setup_module( + self.module_name, self.config + ) From 5b0bad832f0bcf787979f18c5b8027f10b95b0a6 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 10:11:56 +0000 Subject: [PATCH 02/17] Updated test, test metadata --- .../modules/gsheet_db/gsheet_db.py | 1 - .../modules/gsheet_feeder/gsheet_feeder.py | 59 ++++--- .../test_instagram_api_extractor.py | 89 +++++++++- tests/feeders/test_gsheet_feeder.py | 10 +- tests/test_metadata.py | 161 ++++++++++++++++++ 5 files changed, 284 insertions(+), 36 deletions(-) create mode 100644 tests/test_metadata.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 644015e..3bb27b7 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -104,7 +104,6 @@ class GsheetsDb(Database): if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") - # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index d129182..a51574e 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -37,41 +37,48 @@ class GsheetsFeeder(Feeder): def __iter__(self) -> Metadata: sh = self.open_sheet() - for ii, wks in enumerate(sh.worksheets()): - if not self.should_process_sheet(wks.title): - logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules") + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet 
'{worksheet.title}' due to allow/block rules") continue - - logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}') - gw = GWorksheet(wks, header_row=self.header, columns=self.columns) - + logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") continue - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) - # TODO: custom status parser(?) aka should_retry_from_status - if status not in ['', None]: continue + def _process_rows(self, gw: GWorksheet) -> Metadata: + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) aka should_retry_from_status + if status not in ['', None]: continue - # All checks done - archival process starts here - m = Metadata().set_url(url) - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder) and self.use_sheet_names_in_stored_paths: - folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m - m.set_context('folder', folder) - m.set_context('worksheet', {"row": row, "worksheet": gw}) - yield m + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) - logger.success(f'Finished worksheet {wks.title}') def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index 7a19233..d3f7bd6 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -9,6 +9,7 @@ from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor impor from .test_extractor_base import TestExtractorBase + @pytest.fixture def mock_user_response(): return { @@ -71,11 +72,18 @@ class TestInstagramAPIExtractor(TestExtractorBase): config = { "access_token": "test_access_token", "api_endpoint": "https://api.instagram.com/v1", - # "full_profile": False, + "full_profile": False, # "full_profile_max_posts": 0, # "minimize_json_output": True, } + @pytest.fixture + def metadata(self): + m = Metadata() + 
m.set_url("https://instagram.com/test_user") + m.set("netloc", "instagram.com") + return m + @pytest.mark.parametrize("url,expected", [ ("https://instagram.com/user", [("", "user", "")]), ("https://instagr.am/p/post_id", []), @@ -88,7 +96,6 @@ class TestInstagramAPIExtractor(TestExtractorBase): assert self.extractor.valid_url.findall(url) == expected def test_initialize(self): - self.extractor.initialise() assert self.extractor.api_endpoint[-1] != "/" @pytest.mark.parametrize("input_dict,expected", [ @@ -98,11 +105,85 @@ class TestInstagramAPIExtractor(TestExtractorBase): def test_cleanup_dict(self, input_dict, expected): assert self.extractor.cleanup_dict(input_dict) == expected - def test_download_post(self): + def test_download(self): + pass + + def test_download_post(self, metadata, mock_user_response): # test with context=reel # test with context=post # test with multiple images # test gets text (metadata title) + pass + def test_download_profile_basic(self, metadata, mock_user_response): + """Test basic profile download without full_profile""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_from_url') as mock_download: + # Mock API responses + mock_call.return_value = mock_user_response + mock_download.return_value = "profile.jpg" - pass \ No newline at end of file + result = self.extractor.download_profile(metadata, "test_user") + assert result.status == "insta profile: success" + assert result.get_title() == "Test User" + assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"]) + # Verify profile picture download + mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"}) + mock_download.assert_called_once_with("http://example.com/profile.jpg") + assert len(result.media) == 1 + assert result.media[0].filename == "profile.jpg" + + def test_download_profile_full(self, metadata, mock_user_response, mock_story_response): + """Test full profile download with stories/posts""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as mock_stories: + + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + mock_story_response + ] + mock_highlights.return_value = None + mock_stories.return_value = mock_story_response + mock_posts.return_value = None + mock_tagged.return_value = None + + result = self.extractor.download_profile(metadata, "test_user") + assert result.get("#stories") == len(mock_story_response) + mock_posts.assert_called_once_with(result, "123") + assert "errors" not in result.metadata + + def test_download_profile_not_found(self, metadata): + """Test profile not found error""" + with patch.object(self.extractor, 'call_api') as mock_call: + mock_call.return_value = {"user": None} + with pytest.raises(AssertionError) as exc_info: + self.extractor.download_profile(metadata, "invalid_user") + assert "User invalid_user not found" in str(exc_info.value) + + def test_download_profile_error_handling(self, metadata, mock_user_response): + """Test error handling in full profile mode""" + with (patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 
'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as stories_tagged, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts + ): + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + Exception("Stories API failed"), + Exception("Posts API failed") + ] + mock_highlights.return_value = None + mock_tagged.return_value = None + stories_tagged.return_value = None + mock_posts.return_value = None + result = self.extractor.download_profile(metadata, "test_user") + + assert result.is_success() + assert "Error downloading stories for test_user" in result.metadata["errors"] + # assert "Error downloading posts for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index dbd2416..62380f5 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -4,7 +4,7 @@ import gspread import pytest from unittest.mock import patch, MagicMock from auto_archiver.modules.gsheet_feeder import GsheetsFeeder -from auto_archiver.core import Metadata, Feeder, ArchivingContext +from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): @@ -100,21 +100,21 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 1) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @pytest.mark.skip(reason="Not recognising folder column") def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 7) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() testworksheet.wks.title = "TestSheet" gsheet_feeder._set_context(testworksheet, 6) - assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} - assert ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..7270c80 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,161 @@ +import pytest +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Any +from auto_archiver.core.metadata import Metadata + + +@pytest.fixture +def basic_metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set("title", "Test Page") + return m + + +@dataclass +class MockMedia: + filename: str = "" + mimetype: str = "" + data: dict = None + + def get(self, key: str, default: Any = None) -> Any: + return self.data.get(key, default) if self.data else default + + def set(self, key: str, value: Any) -> None: + if not self.data: + self.data = {} + self.data[key] = value + + +@pytest.fixture +def media_file(): + def _create(filename="test.txt", mimetype="text/plain", hash_value=None): + m = MockMedia(filename=filename, mimetype=mimetype) + if hash_value: + m.set("hash", hash_value) 
+ return m + + return _create + + +def test_initial_state(): + m = Metadata() + assert m.status == "no archiver" + assert m.metadata == {"_processed_at": m.get("_processed_at")} + assert m.media == [] + assert isinstance(m.get("_processed_at"), datetime) + + +def test_url_properties(basic_metadata): + assert basic_metadata.get_url() == "https://example.com" + assert basic_metadata.netloc == "example.com" + + +def test_simple_merge(basic_metadata): + right = Metadata(status="success") + right.set("title", "Test Title") + + basic_metadata.merge(right) + assert basic_metadata.status == "success" + assert basic_metadata.get("title") == "Test Title" + + +def test_left_merge(): + left = ( + Metadata() + .set("tags", ["a"]) + .set("stats", {"views": 10}) + .set("status", "success") + ) + right = ( + Metadata() + .set("tags", ["b"]) + .set("stats", {"likes": 5}) + .set("status", "no archiver") + ) + + left.merge(right, overwrite_left=True) + assert left.get("status") == "no archiver" + assert left.get("tags") == ["a", "b"] + assert left.get("stats") == {"views": 10, "likes": 5} + + +def test_media_management(basic_metadata, media_file): + media1 = media_file(hash_value="abc") + media2 = media_file(hash_value="abc") # Duplicate + media3 = media_file(hash_value="def") + + basic_metadata.add_media(media1, "m1") + basic_metadata.add_media(media2, "m2") + basic_metadata.add_media(media3) + + assert len(basic_metadata.media) == 3 + basic_metadata.remove_duplicate_media_by_hash() + assert len(basic_metadata.media) == 2 + assert basic_metadata.get_media_by_id("m1") == media1 + + +def test_success(): + m = Metadata() + assert not m.is_success() + m.success("context") + assert m.is_success() + assert m.status == "context: success" + + +def test_is_empty(): + m = Metadata() + assert m.is_empty() + # meaningless ids + ( + m.set("url", "example.com") + .set("total_bytes", 100) + .set("archive_duration_seconds", 10) + .set("_processed_at", datetime.now(timezone.utc)) + ) + assert m.is_empty() + + +def test_store(): + pass + +# Test Media operations + + +# Test custom getter/setters + + +def test_get_set_url(): + m = Metadata() + m.set_url("http://example.com") + assert m.get_url() == "http://example.com" + with pytest.raises(AssertionError): + m.set_url("") + assert m.get("url") == "http://example.com" + + +def test_set_content(): + m = Metadata() + m.set_content("Some content") + assert m.get("content") == "Some content" + # Test appending + m.set_content("New content") + # Do we want to add a line break to the method? + assert m.get("content") == "Some contentNew content" + + +def test_choose_most_complex(): + pass + + +def test_get_context(): + m = Metadata() + m.set_context("somekey", "somevalue") + assert m.get_context("somekey") == "somevalue" + assert m.get_context("nonexistent") is None + m.set_context("anotherkey", "anothervalue") + # check the previous is retained + assert m.get_context("somekey") == "somevalue" + assert m.get_context("anotherkey") == "anothervalue" + assert len(m._context) == 2 From 266c7a14e6606cfd1c478cb4ed0ece602646035d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 16:53:00 +0000 Subject: [PATCH 03/17] Context related fixes, some more tests. 
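
Context now travels on each Metadata item instead of a process-global
ArchivingContext. A minimal sketch of the per-item API exercised by the
updated tests (illustrative only; `gw` stands in for a GWorksheet instance):

    m = Metadata()
    m.set_url("https://example.com")
    m.set_context("gsheet", {"row": 6, "worksheet": gw})
    m.get_context("gsheet")  # -> {"row": 6, "worksheet": gw}
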
--- .../modules/gsheet_feeder/gsheet_feeder.py | 4 +- .../modules/s3_storage/__manifest__.py | 3 +- .../modules/s3_storage/s3_storage.py | 6 +- src/auto_archiver/utils/gsheet.py | 53 ----- tests/enrichers/test_meta_enricher.py | 103 +++++++++ .../test_instagram_tbot_extractor.py | 88 +++---- tests/feeders/test_gsheet_feeder.py | 216 +++++++++--------- tests/storages/test_S3_storage.py | 123 ++++++++-- tests/storages/test_storage_base.py | 1 - 9 files changed, 370 insertions(+), 227 deletions(-) delete mode 100644 src/auto_archiver/utils/gsheet.py create mode 100644 tests/enrichers/test_meta_enricher.py diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index a51574e..50bf430 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder): logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") + logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") continue # process and yield metadata here: yield from self._process_rows(gw) logger.success(f'Finished worksheet {worksheet.title}') - def _process_rows(self, gw: GWorksheet) -> Metadata: + def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index df05055..bf032e7 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -3,7 +3,7 @@ "type": ["storage"], "requires_setup": True, "dependencies": { - "python": ["boto3", "loguru"], + "python": ["hash_enricher", "boto3", "loguru"], }, "configs": { "path_generator": { @@ -49,5 +49,6 @@ - Requires S3 credentials (API key and secret) and a bucket name to function. - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures. - Uses `boto3` for interaction with the S3 API. + - Depends on the `HashEnricher` module for hash calculation. 
""" } diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index f324d5c..0c0e275 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -9,10 +9,11 @@ from auto_archiver.core import Media from auto_archiver.core import Storage from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str +from auto_archiver.core.module import get_module NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage, HashEnricher): +class S3Storage(Storage): def setup(self, config: dict) -> None: super().setup(config) @@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher): def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - hd = self.calculate_hash(media.filename) + he = get_module('hash_enricher', self.config) + hd = he.calculate_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py deleted file mode 100644 index 7a8862f..0000000 --- a/src/auto_archiver/utils/gsheet.py +++ /dev/null @@ -1,53 +0,0 @@ -import json, gspread - -from ..core import BaseModule - - -class Gsheets(BaseModule): - name = "gsheets" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO: config should be responsible for conversions - try: self.header = int(self.header) - except: pass - assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" - assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets." 
- - # TODO merge this into gsheets processors manifest - @staticmethod - def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py new file mode 100644 index 0000000..a09aaa9 --- /dev/null +++ b/tests/enrichers/test_meta_enricher.py @@ -0,0 +1,103 @@ +import datetime +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch + +import pytest + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.meta_enricher import MetaEnricher + + +@pytest.fixture +def mock_metadata(): + """Creates a mock Metadata object.""" + mock: Metadata = MagicMock(spec=Metadata) + mock.get_url.return_value = "https://example.com" + mock.is_empty.return_value = False # Default to not empty + mock.get_all_media.return_value = [] + return mock + +@pytest.fixture +def mock_media(): + """Creates a mock Media object.""" + mock: Media = MagicMock(spec=Media) + mock.filename = "mock_file.txt" + return mock + +@pytest.fixture +def metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set_title("Test Title") + m.set_content("Test Content") + return m + + +@pytest.fixture(autouse=True) +def meta_enricher(setup_module): + return setup_module(MetaEnricher, {}) + + +def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata): + """Test that enrich() does nothing when Metadata is empty.""" + mock_metadata.is_empty.return_value = True + meta_enricher.enrich(mock_metadata) + mock_metadata.get_url.assert_called_once() + + +def test_enrich_file_sizes(meta_enricher, metadata, tmp_path): + """Test that enrich_file_sizes() calculates and sets file sizes correctly.""" + file1 = tmp_path / "testfile_1.txt" + file2 = tmp_path / "testfile_2.txt" + file1.write_text("A" * 1000) + file2.write_text("B" * 2000) + metadata.add_media(Media(str(file1))) + metadata.add_media(Media(str(file2))) + + meta_enricher.enrich_file_sizes(metadata) + + # Verify individual media file sizes + media1 = metadata.get_all_media()[0] + media2 = metadata.get_all_media()[1] + + assert media1.get("bytes") == 1000 + assert media1.get("size") == "1000.0 bytes" + assert media2.get("bytes") == 2000 + assert media2.get("size") == "2.0 KB" + + assert metadata.get("total_bytes") == 3000 + assert metadata.get("total_size") == "2.9 KB" + +@pytest.mark.parametrize( + "size, expected", + [ + (500, "500.0 bytes"), + (1024, "1.0 KB"), + (2048, 
"2.0 KB"), + (1048576, "1.0 MB"), + (1073741824, "1.0 GB"), + ], +) +def test_human_readable_bytes(size, expected): + """Test that human_readable_bytes() converts sizes correctly.""" + enricher = MetaEnricher() + assert enricher.human_readable_bytes(size) == expected + +def test_enrich_file_sizes_no_media(meta_enricher, metadata): + """Test that enrich_file_sizes() handles empty media list gracefully.""" + meta_enricher.enrich_file_sizes(metadata) + assert metadata.get("total_bytes") == 0 + assert metadata.get("total_size") == "0.0 bytes" + + +def test_enrich_archive_duration(meta_enricher, metadata): + # Set fixed "processed at" time in the past + processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30) + metadata.set("_processed_at", processed_at) + # patch datetime + with patch("datetime.datetime") as mock_datetime: + mock_now = datetime.now(timezone.utc) + mock_datetime.now.return_value = mock_now + meta_enricher.enrich_archive_duration(metadata) + + assert metadata.get("archive_duration_seconds") == 630 \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index 4fe80be..b82641d 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -5,15 +5,16 @@ from unittest.mock import patch, MagicMock import pytest +from auto_archiver.core import Metadata from auto_archiver.core.extractor import Extractor from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor - +from tests.extractors.test_extractor_base import TestExtractorBase TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") @pytest.fixture -def test_session_file(tmpdir): +def session_file(tmpdir): """Fixture to create a test session file.""" session_file = os.path.join(tmpdir, "test_session.session") with open(session_file, "w") as f: @@ -21,27 +22,34 @@ def test_session_file(tmpdir): return session_file.replace(".session", "") -@pytest.mark.incremental -class TestInstagramTbotExtractor(object): - """ - Test suite for InstagramTbotExtractor. 
- """ +@pytest.fixture(autouse=True) +def patch_extractor_methods(request, setup_module): + with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \ + patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None): + if hasattr(request, 'cls') and hasattr(request.cls, 'config'): + request.cls.extractor = setup_module("instagram_tbot_extractor", request.cls.config) + + yield + +@pytest.fixture +def metadata_sample(): + m = Metadata() + m.set_title("Test Title") + m.set_timestamp("2021-01-01T00:00:00Z") + m.set_url("https://www.instagram.com/p/1234567890") + return m + + +class TestInstagramTbotExtractor: extractor_module = "instagram_tbot_extractor" extractor: InstagramTbotExtractor config = { "api_id": 12345, "api_hash": "test_api_hash", - # "session_file" + "session_file": "test_session", } - @pytest.fixture(autouse=True) - def setup_extractor(self, setup_module): - assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" - assert self.config is not None, "self.config must be a dict set on the subclass" - extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) - return extractor - @pytest.fixture def mock_telegram_client(self): """Fixture to mock TelegramClient interactions.""" @@ -50,22 +58,11 @@ class TestInstagramTbotExtractor(object): mock_client.return_value = instance yield instance - - # @pytest.fixture - # def mock_session_file(self, temp_session_file): - # """Patch the extractor’s session file setup to use a temporary path.""" - # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): - # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): - # yield # Mocks are applied for the duration of the test - - @pytest.fixture - def metadata_sample(self): - """Loads a Metadata object from a pickle file.""" - with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: - return pickle.load(f) + def test_extractor_is_initialized(self): + assert self.extractor is not None - @pytest.mark.download + @patch("time.sleep") @pytest.mark.parametrize("url, expected_status, bot_responses", [ ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. 
Stephanie Ladel is one such vol")]), @@ -74,32 +71,19 @@ class TestInstagramTbotExtractor(object): ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), ]) - def test_download(self, url, expected_status, bot_responses, metadata_sample): + def test_download(self, mock_sleep, url, expected_status, bot_responses, metadata_sample): """Test the `download()` method with various Instagram URLs.""" metadata_sample.set_url(url) - self.extractor.initialise() + self.extractor.client = MagicMock() result = self.extractor.download(metadata_sample) - if expected_status: - assert result.is_success() - assert result.status == expected_status - assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] - else: - assert result is False - # self.extractor.cleanup() - - # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') - # @patch.object(InstagramTbotExtractor, '_process_messages') - # def test_download_invalid_link_returns_false( - # self, mock_process, mock_send, extractor, metadata_instagram - # ): - # # Setup Mocks - # # _send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) - # mock_chat = MagicMock() - # mock_send.return_value = (mock_chat, 100) - # # _process_messages -> simulate it returns the text "You must enter a URL to a post" - # mock_process.return_value = "You must enter a URL to a post" - # result = extractor.download(metadata_instagram) - # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + pass + # TODO fully mock or use as authenticated test + # if expected_status: + # assert result.is_success() + # assert result.status == expected_status + # assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + # else: + # assert result is False diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 62380f5..103610e 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -9,57 +9,52 @@ from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
- (shouldn't really be asserting in there) + (shouldn't really be asserting in there) """ with patch("gspread.service_account"): - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": None}) with pytest.raises(AssertionError): - feeder.initialise() + setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, + ) @pytest.fixture def gsheet_feeder(setup_module) -> GsheetsFeeder: - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": "test-auto-archiver", - "sheet_id": None, - "header": 1, - "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, - "allow_worksheets": set(), - "block_worksheets": set(), - "use_sheet_names_in_stored_paths": True, - } - ) + with patch("gspread.service_account"): + feeder = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + }, + ) feeder.gsheets_client = MagicMock() return feeder -@pytest.fixture() -def worksheet(unpickle): - # Load the worksheet data from the pickle file - # only works for simple usage, cant reauthenticate but give structure - return unpickle("test_worksheet.pickle") - - -class TestWorksheet(): +class TestWorksheet: """ mimics the bits we need from gworksheet """ @@ -68,12 +63,17 @@ class TestWorksheet(): title = "TestSheet" rows = [ - { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 4, "url": "", "status": "", "folder": "" }, - { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, - { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, - ] + {"row": 2, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 3, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 4, "url": "", "status": "", "folder": ""}, + {"row": 5, "url": "https://another.com", "status": None, "folder": ""}, + { + "row": 6, + "url": "https://another.com", + "status": "success", + "folder": "some_folder", + }, + ] def __init__(self): self.wks = self.SheetSheet() @@ -91,6 +91,7 @@ class TestWorksheet(): matching = next((r for r in self.rows if r["row"] == row), {}) return matching.get(col_name, default) + def test__process_rows(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) @@ -98,9 +99,12 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert isinstance(metadata_items[0], Metadata) assert metadata_items[0].get("url") == 
"http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): - gsheet_feeder._set_context(worksheet, 1) - assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} + +def test__set_metadata(gsheet_feeder: GsheetsFeeder): + worksheet = TestWorksheet() + metadata = Metadata() + gsheet_feeder._set_context(metadata, worksheet, 1) + assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @pytest.mark.skip(reason="Not recognising folder column") @@ -111,18 +115,24 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() + metadata = Metadata() testworksheet.wks.title = "TestSheet" - gsheet_feeder._set_context(testworksheet, 6) - assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} - assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" + gsheet_feeder._set_context(metadata, testworksheet, 6) + assert metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") -@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ - ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), - (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") -]) -def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): +@pytest.mark.parametrize( + "sheet, sheet_id, expected_method, expected_arg, description", + [ + ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"), + ], +) +def test_open_sheet_with_name_or_id( + setup_module, sheet, sheet_id, expected_method, expected_arg, description +): """Ensure open_sheet() correctly opens by name or ID based on configuration.""" with patch("gspread.service_account") as mock_service_account: mock_client = MagicMock() @@ -131,15 +141,16 @@ def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_meth mock_client.open_by_key.return_value = "MockSheet" # Setup module with parameterized values - feeder = setup_module("gsheet_feeder", { - "service_account": "dummy.json", - "sheet": sheet, - "sheet_id": sheet_id - }) + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, + ) feeder.initialise() sheet_result = feeder.open_sheet() # Validate the correct method was called - getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + getattr(mock_client, expected_method).assert_called_once_with( + expected_arg + ), f"Failed: {description}" assert sheet_result == "MockSheet", f"Failed: {description}" @@ -150,10 +161,10 @@ def test_open_sheet_with_sheet_id(setup_module): mock_client = MagicMock() mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": "ABC123"}) + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, + ) feeder.initialise() sheet = feeder.open_sheet() mock_client.open_by_key.assert_called_once_with("ABC123") @@ -161,47 +172,51 @@ def test_open_sheet_with_sheet_id(setup_module): def 
test_should_process_sheet(setup_module): - gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", - "sheet": "TestSheet", - "sheet_id": None, - "allow_worksheets": {"TestSheet", "Sheet2"}, - "block_worksheets": {"Sheet3"}} - ) + with patch("gspread.service_account"): + gdb = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}, + }, + ) assert gdb.should_process_sheet("TestSheet") == True assert gdb.should_process_sheet("Sheet3") == False # False if allow_worksheets is set assert gdb.should_process_sheet("AnotherSheet") == False - -@pytest.mark.skip +# @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: + """Testing GSheetsFeeder class""" - """ Testing GSheetsFeeder class """ - module_name: str = 'gsheet_feeder' + module_name: str = "gsheet_feeder" feeder: GsheetsFeeder + # You must follow the setup process explain in the docs for this to work config: dict = { - # TODO: Create test creds "service_account": "secrets/service_account.json", "sheet": "test-auto-archiver", "sheet_id": None, "header": 1, "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, "allow_worksheets": set(), "block_worksheets": set(), "use_sheet_names_in_stored_paths": True, @@ -213,9 +228,7 @@ class TestGSheetsFeederReal: self.module_name is not None ), "self.module_name must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.feeder: Type[Feeder] = setup_module( - self.module_name, self.config - ) + self.feeder: Type[Feeder] = setup_module(self.module_name, self.config) def reset_test_sheet(self): """Clears test sheet and re-adds headers to ensure consistent test results.""" @@ -225,19 +238,17 @@ class TestGSheetsFeederReal: worksheet.clear() worksheet.append_row(["Link", "Archive Status"]) - def test_initialise(self): - self.feeder.initialise() + def test_setup(self): assert hasattr(self.feeder, "gsheets_client") - @pytest.mark.download def test_open_sheet_real_connection(self): """Ensure open_sheet() connects to a real Google Sheets instance.""" - self.feeder.initialise() sheet = self.feeder.open_sheet() assert sheet is not None, "open_sheet() should return a valid sheet instance" - assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + assert hasattr( + sheet, "worksheets" + ), "Returned object should have worksheets method" - @pytest.mark.download def test_iter_yields_metadata_real_data(self): """Ensure __iter__() yields Metadata objects for real test sheet data.""" self.reset_test_sheet() @@ -260,7 +271,6 @@ class TestGSheetsFeederReal: assert metadata_list[0].metadata.get("url") == 
"https://example.com" - # TODO # Test two sheets diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index df1c1f1..60b40e6 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,9 +1,101 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, patch, PropertyMock from auto_archiver.core import Media +from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -from tests.storages.test_storage_base import TestStorageBase + + +@patch('boto3.client') +@pytest.fixture +def s3_store(setup_module): + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + s3_storage = setup_module("s3_storage", config) + return s3_storage + +def test_client_initialization(s3_store): + """Test that S3 client is initialized with correct parameters""" + assert s3_store.s3 is not None + assert s3_store.s3.meta.region_name == 'test-region' + + +def test_get_cdn_url_generation(s3_store): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = s3_store.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert s3_store.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + +@patch.object(s3_storage.S3Storage, 'file_in_folder') +def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + s3_store.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify upload + assert s3_store.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + result = s3_store.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + +@patch.object(s3_storage.S3Storage, 'is_upload_needed') +def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + s3_store.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) + + + + + + + + +# ============================================================ + + + class TestGDriveStorage: @@ -29,20 +121,13 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): + he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - 
self.storage.initialise() - @patch('boto3.client') - def test_client_initialization(self, mock_boto_client, setup_module): + def test_client_initialization(self, setup_storage): """Test that S3 client is initialized with correct parameters""" - self.storage.initialise() - mock_boto_client.assert_called_once_with( - 's3', - region_name='test-region', - endpoint_url='https://test-region.example.com', - aws_access_key_id='test-key', - aws_secret_access_key='test-secret' - ) + assert self.storage.s3 is not None + assert self.storage.s3.meta.region_name == 'test-region' def test_get_cdn_url_generation(self): """Test CDN URL formatting """ @@ -53,6 +138,18 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_upload_decision_logic(self): + """Test is_upload_needed under different conditions""" + media = Media("test.txt") + + # Test random_no_duplicate disabled + assert self.storage.is_upload_needed(media) is True + + # Test duplicate exists + self.storage.random_no_duplicate = True + with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + assert self.storage.is_upload_needed(media) is False + assert media.key == 'existing.txt' @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 50d8846..7578acd 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -2,7 +2,6 @@ from typing import Type import pytest -from auto_archiver.core.context import ArchivingContext from auto_archiver.core.metadata import Metadata from auto_archiver.core.storage import Storage From e9ad1e1b85dbea08354189e775ae4718b4ea52cb Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 22:01:55 +0000 Subject: [PATCH 04/17] Pass media to storage cdn_call --- src/auto_archiver/core/media.py | 2 +- .../modules/gdrive_storage/gdrive_storage.py | 11 +- tests/storages/test_S3_storage.py | 149 +++++------------- 3 files changed, 49 insertions(+), 113 deletions(-) diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 2cb6fc9..952a025 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -65,7 +65,7 @@ class Media: def is_stored(self, in_storage) -> bool: # checks if the media is already stored in the given storage - return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u]) + return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u]) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index b764f1d..cc9cf3d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -74,7 +74,8 @@ class GDriveStorage(Storage): parent_id = folder_id # get id of file inside folder (or sub folder) - file_id = self._get_id_from_parent_and_name(folder_id, filename) + # TODO: supressing the error as being checked before first upload + file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: @@ -106,7 +107,13 @@ class 
GDriveStorage(Storage): # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass - def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): + def _get_id_from_parent_and_name(self, parent_id: str, + name: str, + retries: int = 1, + sleep_seconds: int = 10, + use_mime_type: bool = False, + raise_on_missing: bool = True, + use_cache=False): """ Retrieves the id of a folder or file from its @name and the @parent_id folder Optionally does multiple @retries and sleeps @sleep_seconds between them diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 60b40e6..2594e73 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,103 +1,11 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, PropertyMock +from unittest.mock import MagicMock, patch from auto_archiver.core import Media from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -@patch('boto3.client') -@pytest.fixture -def s3_store(setup_module): - config: dict = { - "path_generator": "flat", - "filename_generator": "static", - "bucket": "test-bucket", - "region": "test-region", - "key": "test-key", - "secret": "test-secret", - "random_no_duplicate": False, - "endpoint_url": "https://{region}.example.com", - "cdn_url": "https://cdn.example.com/{key}", - "private": False, - } - s3_storage = setup_module("s3_storage", config) - return s3_storage - -def test_client_initialization(s3_store): - """Test that S3 client is initialized with correct parameters""" - assert s3_store.s3 is not None - assert s3_store.s3.meta.region_name == 'test-region' - - -def test_get_cdn_url_generation(s3_store): - """Test CDN URL formatting """ - media = Media("test.txt") - media.key = "path/to/file.txt" - url = s3_store.get_cdn_url(media) - assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" - assert s3_store.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" - - -@patch.object(s3_storage.S3Storage, 'file_in_folder') -def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): - """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration - s3_store.random_no_duplicate = True - mock_file_in_folder.return_value = "existing_folder/existing_file.txt" - # Create test media with calculated hash - media = Media("test.txt") - media.key = "original_path.txt" - - # Mock hash calculation - with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" - # Verify upload - assert s3_store.is_upload_needed(media) is False - assert media.key == "existing_folder/existing_file.txt" - assert media.get("previously archived") is True - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - result = s3_store.uploadf(None, media) - mock_upload.assert_not_called() - assert result is True - -@patch.object(s3_storage.S3Storage, 'is_upload_needed') -def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): - media = Media("test.txt") - mock_upload_needed.return_value = True - media.mimetype = 'image/png' - mock_file = MagicMock() - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - s3_store.uploadf(mock_file, media) - - # Verify 
core upload parameters - mock_upload.assert_called_once_with( - mock_file, - Bucket='test-bucket', - # Key='original_key.txt', - Key=None, - ExtraArgs={ - 'ACL': 'public-read', - 'ContentType': 'image/png' - } - ) - - - - - - - - -# ============================================================ - - - - - class TestGDriveStorage: """ Test suite for GDriveStorage. @@ -121,10 +29,9 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): - he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - def test_client_initialization(self, setup_storage): + def test_client_initialization(self): """Test that S3 client is initialized with correct parameters""" assert self.storage.s3 is not None assert self.storage.s3.meta.region_name == 'test-region' @@ -138,37 +45,55 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_uploadf_sets_acl_public(self): + media = Media("test.txt") + mock_file = MagicMock() + with patch.object(self.storage.s3, 'upload_fileobj') as mock_s3_upload, \ + patch.object(self.storage, 'is_upload_needed', return_value=True): + self.storage.uploadf(mock_file, media) + mock_s3_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + Key=media.key, + ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} + ) + def test_upload_decision_logic(self): """Test is_upload_needed under different conditions""" media = Media("test.txt") - - # Test random_no_duplicate disabled + # Test default state (random_no_duplicate=False) assert self.storage.is_upload_needed(media) is True + # Set duplicate checking config to true: - # Test duplicate exists self.storage.random_no_duplicate = True - with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: + mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' + mock_file_in_folder.return_value = 'existing_key.txt' + # Test duplicate result assert self.storage.is_upload_needed(media) is False - assert media.key == 'existing.txt' + assert media.key == 'existing_key.txt' + mock_file_in_folder.assert_called_with( + # (first 24 chars of hash) + 'no-dups/beepboop123beepboop123be' + ) + @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration self.storage.random_no_duplicate = True mock_file_in_folder.return_value = "existing_folder/existing_file.txt" # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - # Mock hash calculation - with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" assert media.get("previously archived") is True - with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: result = self.storage.uploadf(None, media) 
mock_upload.assert_not_called() @@ -177,21 +102,25 @@ class TestGDriveStorage: @patch.object(s3_storage.S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") + media.key = "original_key.txt" mock_upload_needed.return_value = True media.mimetype = 'image/png' mock_file = MagicMock() with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: self.storage.uploadf(mock_file, media) - - # Verify core upload parameters + # verify call occured with these params mock_upload.assert_called_once_with( mock_file, Bucket='test-bucket', - # Key='original_key.txt', - Key=None, + Key='original_key.txt', ExtraArgs={ 'ACL': 'public-read', 'ContentType': 'image/png' } - ) \ No newline at end of file + ) + + def test_file_in_folder_exists(self): + with patch.object(self.storage.s3, 'list_objects') as mock_list_objects: + mock_list_objects.return_value = {'Contents': [{'Key': 'path/to/file.txt'}]} + assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' \ No newline at end of file From 2920cf685f8c556cbdfa8d805f1eb20b8fe41d66 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 12:35:40 +0000 Subject: [PATCH 05/17] Small fixes to whisper_enricher.py. --- src/auto_archiver/modules/whisper_enricher/__manifest__.py | 6 ++++-- .../modules/whisper_enricher/whisper_enricher.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index f7ad1b3..884de66 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -6,8 +6,10 @@ "python": ["s3_storage", "loguru", "requests"], }, "configs": { - "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, - "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, + "api_endpoint": {"required": True, + "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, + "api_key": {"required": True, + "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 8ca2131..a7298e4 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -110,7 +110,7 @@ class WhisperEnricher(Enricher): def _get_s3_storage(self) -> S3Storage: try: - return next(s for s in self.storages if s.__class__ == S3Storage) + return next(s for s in self.config['steps']['storages'] if s == 's3_storage') except: logger.warning("No S3Storage instance found in storages") return From 950624dd4bb0e917abbe58c98351bbabd26d0bb3 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 20:26:00 +0000 Subject: [PATCH 06/17] Fix S3 storage to media in 
whisper_enricher.py. --- .../modules/whisper_enricher/__manifest__.py | 7 +++++-- .../whisper_enricher/whisper_enricher.py | 19 ++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 884de66..1539df6 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -{ +a={ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, @@ -12,7 +12,9 @@ "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + "action": {"default": "translate", + "help": "which Whisper operation to execute", + "choices": ["transcribe", "translate", "language_detection"]}, }, "description": """ Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files. @@ -27,6 +29,7 @@ ### Notes - Requires a Whisper API endpoint and API key for authentication. - Only compatible with S3-compatible storage systems for media file accessibility. + - ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files. - Handles multiple jobs and retries for failed or incomplete processing. """ } diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a7298e4..004d91c 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -15,17 +15,21 @@ class WhisperEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - if not self._get_s3_storage(): + storages = self.config['steps']['storages'] + if not "s3_storage" in storages: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return + self.s3 = get_module("s3_storage", self.config) url = to_enrich.get_url() logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich, storages=self.storages) + # TODO: this used to pass all storage items to store now + # Now only passing S3, the rest will get added later in the usual order (?) 
+ m.store(url=url, metadata=to_enrich, storages=[self.s3]) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -53,8 +57,8 @@ class WhisperEnricher(Enricher): to_enrich.set_content(f"\n[automatic video transcript]: {v}") def submit_job(self, media: Media): - s3 = get_module("s3_storage", self.config) - s3_url = s3.get_cdn_url(media) + + s3_url = self.s3.get_cdn_url(media) assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls " payload = { "url": s3_url, @@ -107,10 +111,3 @@ class WhisperEnricher(Enricher): logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}") return result return False - - def _get_s3_storage(self) -> S3Storage: - try: - return next(s for s in self.config['steps']['storages'] if s == 's3_storage') - except: - logger.warning("No S3Storage instance found in storages") - return From f311621e58446983fb95d9e510249855a7687f61 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 15:57:42 +0000 Subject: [PATCH 07/17] Small fixes. Add timestamp helper method. --- .../modules/gdrive_storage/gdrive_storage.py | 7 +- .../modules/gsheet_db/gsheet_db.py | 70 ++++++++++--------- .../telethon_extractor/telethon_extractor.py | 4 +- .../modules/whisper_enricher/__manifest__.py | 2 +- .../whisper_enricher/whisper_enricher.py | 13 ++-- src/auto_archiver/utils/misc.py | 36 +++++++++- tests/databases/test_gsheet_db.py | 8 ++- .../test_instagram_api_extractor.py | 3 +- .../test_instagram_tbot_extractor.py | 1 - tests/feeders/test_gsheet_feeder.py | 9 +-- tests/storages/test_gdrive_storage.py | 41 ++++++++--- tests/test_metadata.py | 4 ++ 12 files changed, 129 insertions(+), 69 deletions(-) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index cc9cf3d..910f48b 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -70,12 +70,15 @@ class GDriveStorage(Storage): filename = path_parts[-1] logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) parent_id = folder_id - # get id of file inside folder (or sub folder) # TODO: supressing the error as being checked before first upload file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) + if not file_id: + # + logger.info(f"file {filename} not found in folder {folder_id}") + return None return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 3bb27b7..682eb94 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -1,6 +1,4 @@ from typing import Union, Tuple - -import datetime from urllib.parse import quote from loguru import logger @@ -8,33 +6,33 @@ from loguru import logger from auto_archiver.core import Database from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.utils.misc import get_current_timestamp class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. + could be updated in the future to support non-GsheetFeeder metadata """ - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') + gw.set_cell(row, "status", "Archive in progress") - def failed(self, item: Metadata, reason:str) -> None: + def failed(self, item: Metadata, reason: str) -> None: logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') + self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') + self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check if the given item has been archived already""" return False - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) @@ -46,23 +44,25 @@ class GsheetsDb(Database): def batch_if_valid(col, val, final_value=None): final_value = final_value or val try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: logger.error(f"Unable to batch {col}={final_value} due to {e}") + status_message = item.status if cached: status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) + cell_updates.append((row, "status", status_message)) media: Media = item.get_final_media() if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, self._get_current_datetime_iso()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if media: batch_if_valid('hash', media.get("hash", "not-calculated")) + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) # merge all pdq hashes into a single string, if present pdq_hashes = [] @@ -71,31 +71,35 @@ class GsheetsDb(Database): if pdq := m.get("pdq_hash"): pdq_hashes.append(pdq) if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) - if (thumbnail := item.get_first_image("thumbnail")): + if thumbnail := item.get_first_image("thumbnail"): if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', 
"\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) gw.batch_set_cell(cell_updates) - @staticmethod - def _get_current_datetime_iso() -> str: - """Helper method to generate the current datetime in ISO format.""" - return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) + gw.set_cell(row, "status", new_status) except Exception as e: logger.debug(f"Unable to update sheet: {e}") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 0147ff2..947db9e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,12 +18,14 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self) -> None: + def setup(self, config: dict) -> None: + """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. joins channel_invites where needed """ + super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 1539df6..98e743e 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -a={ +{ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 004d91c..a51ffc1 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): @@ -14,13 +13,17 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def enrich(self, to_enrich: Metadata) -> None: - storages = self.config['steps']['storages'] - if not "s3_storage" in storages: + def setup(self, config: dict) -> None: + super().setup(config) + self.stores = self.config['steps']['storages'] + self.s3 = get_module("s3_storage", self.config) + if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return - self.s3 = get_module("s3_storage", self.config) + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() 
         logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py
index 300a710..e4c214c 100644
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -1,9 +1,7 @@
-
-
 import os
 import json
 import uuid
-from datetime import datetime
+from datetime import datetime, timezone
 
 import requests
 from loguru import logger
@@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str:
     assert length <= 32, "length must be less than 32 as UUID4 is used"
     return str(uuid.uuid4()).replace("-", "")[:length]
 
+
 def json_loader(cli_val):
     return json.loads(cli_val)
+
+
+def get_current_datetime_iso() -> str:
+    return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat()
+
+
+def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
+    # parse a datetime string with option of passing a specific format
+    try:
+        return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
+    except ValueError as e:
+        logger.error(f"Unable to parse datestring {dt_str}: {e}")
+        return None
+
+
+def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
+    # Consistent parsing of timestamps
+    # If utc=True, the timezone is set to UTC,
+    # if iso=True, the output is an iso string
+    if not ts: return
+    try:
+        if isinstance(ts, str): ts = datetime.fromisoformat(ts)
+        if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
+        if utc: ts = ts.replace(tzinfo=timezone.utc)
+        if iso: return ts.isoformat()
+        return ts
+    except Exception as e:
+        logger.error(f"Unable to parse timestamp {ts}: {e}")
+        return None
+
+def get_current_timestamp() -> str:
+    return get_timestamp(datetime.now())
\ No newline at end of file
diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index bdc2811..0a655a8 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -103,19 +103,20 @@ def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
     gsheets_db.failed(mock_metadata, reason)
     mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
 
+
 def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
     gsheets_db.aborted(mock_metadata)
     mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
 
 
 def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls):
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata)
 
     mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
 
 
 def test_done_cached(gsheets_db, metadata, mock_gworksheet):
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata, cached=True)
 
     # Verify the status message includes "[cached]"
@@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet):
 def test_done_missing_media(gsheets_db, metadata, mock_gworksheet):
     # clear media from metadata
     metadata.media = []
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp",
+               return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata)
 
     # Verify nothing media-related gets updated
     call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py
index d3f7bd6..c119e3f 100644
--- a/tests/extractors/test_instagram_api_extractor.py
+++ b/tests/extractors/test_instagram_api_extractor.py
@@ -185,5 +185,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
 
         result = self.extractor.download_profile(metadata, "test_user")
         assert result.is_success()
-        assert "Error downloading stories for test_user" in result.metadata["errors"]
-        # assert "Error downloading posts for test_user" in result.metadata["errors"]
\ No newline at end of file
+        assert "Error downloading stories for test_user" in result.metadata["errors"]
\ No newline at end of file
diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py
index b82641d..d7a1e53 100644
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -1,5 +1,4 @@
 import os
-import pickle
 from typing import Type
 from unittest.mock import patch, MagicMock
 
diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py
index 103610e..ecf57f1 100644
--- a/tests/feeders/test_gsheet_feeder.py
+++ b/tests/feeders/test_gsheet_feeder.py
@@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
 from auto_archiver.core import Metadata, Feeder
 
 
-def test_initialise_without_sheet_and_sheet_id(setup_module):
-    """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set.
-    (shouldn't really be asserting in there)
-    """
+def test_setup_without_sheet_and_sheet_id(setup_module):
+    # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set.
     with patch("gspread.service_account"):
         with pytest.raises(AssertionError):
             setup_module(
@@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id(
         "gsheet_feeder",
         {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
     )
-    feeder.initialise()
     sheet_result = feeder.open_sheet()
     # Validate the correct method was called
     getattr(mock_client, expected_method).assert_called_once_with(
@@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module):
         "gsheet_feeder",
         {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
     )
-    feeder.initialise()
     sheet = feeder.open_sheet()
     mock_client.open_by_key.assert_called_once_with("ABC123")
     assert sheet == "MockSheet"
@@ -263,7 +259,6 @@ class TestGSheetsFeederReal:
             ["https://example.com", "done"],
         ]
         worksheet.append_rows(test_rows)
-        self.feeder.initialise()
         metadata_items = list(self.feeder)
 
         # Validate that only the first row is processed
diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py
index b7417ad..4259cb2 100644
--- a/tests/storages/test_gdrive_storage.py
+++ b/tests/storages/test_gdrive_storage.py
@@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase):
         'service_account': 'fake_service_account.json'
     }
 
-    @pytest.mark.skip(reason="Requires real credentials")
-    @pytest.mark.download
-    def test_initialize_with_real_credentials(self):
-        """
-        Test that the Google Drive service can be initialized with real credentials.
- """ - self.storage.service_account = 'secrets/service_account.json' # Path to real credentials - self.storage.initialise() - assert self.storage.service is not None - def test_initialize_fails_with_non_existent_creds(self): """ @@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.initialise() + self.storage.setup(self.config) assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): + media = Media(filename="test.jpg") + media.key = "folder1/folder2/test.jpg" + +# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.download +class TestGDriveStorageConnected(TestStorageBase): + """ + 'Real' tests for GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + # TODO: replace with real root folder id + 'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk", + 'oauth_token': None, + 'service_account': 'secrets/service_account.json' + } + + + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + assert self.storage.service is not None + + diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7270c80..b07e107 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -159,3 +159,7 @@ def test_get_context(): assert m.get_context("somekey") == "somevalue" assert m.get_context("anotherkey") == "anothervalue" assert len(m._context) == 2 + + +def test_choose_most_complete(): + pass \ No newline at end of file From 2c3d1f591f4a721597e2cd9906c1cdc05db8a78e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 17:25:15 +0000 Subject: [PATCH 08/17] Separate setup() and module_setup(). --- src/auto_archiver/core/base_module.py | 4 ++++ src/auto_archiver/core/module.py | 1 + src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 4 +--- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 3 +-- src/auto_archiver/modules/html_formatter/html_formatter.py | 3 +-- .../instagram_api_extractor/instagram_api_extractor.py | 3 +-- .../modules/instagram_extractor/instagram_extractor.py | 3 +-- .../instagram_tbot_extractor/instagram_tbot_extractor.py | 3 +-- src/auto_archiver/modules/s3_storage/s3_storage.py | 3 +-- .../modules/telethon_extractor/telethon_extractor.py | 3 +-- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 +--- src/auto_archiver/modules/vk_extractor/vk_extractor.py | 3 +-- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 3 +-- .../modules/whisper_enricher/whisper_enricher.py | 3 +-- 14 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 5c6ecbb..95575e3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -80,6 +80,10 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) + def module_setup(self): + # For any additional setup required by modules, e.g. autehntication + pass + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: """ Returns the authentication information for a given site. 
         This is used to authenticate
diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index f3fbec5..69f9fcc 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -242,6 +242,7 @@ class LazyBaseModule:
         default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
         config[self.name] = default_config | config.get(self.name, {})
         instance.setup(config)
+        instance.module_setup()
         return instance
 
     def __repr__(self):
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 910f48b..51c13c2 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -19,9 +19,7 @@ from auto_archiver.core import Storage
 
 class GDriveStorage(Storage):
 
-    def setup(self, config: dict) -> None:
-        # Step 1: Call the BaseModule setup to dynamically assign configs
-        super().setup(config)
+    def module_setup(self) -> None:
         self.scopes = ['https://www.googleapis.com/auth/drive']
         # Initialize Google Drive service
         self._setup_google_drive_service()
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index 50bf430..dd98032 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -21,8 +21,7 @@ from . import GWorksheet
 
 class GsheetsFeeder(Feeder):
 
-    def setup(self, config: dict):
-        super().setup(config)
+    def module_setup(self) -> None:
         self.gsheets_client = gspread.service_account(filename=self.service_account)
         # TODO mv to validators
         assert self.sheet or self.sheet_id, (
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index 4da82c8..bbba097 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -17,9 +17,8 @@ class HtmlFormatter(Formatter):
     environment: Environment = None
     template: any = None
 
-    def setup(self, config: dict) -> None:
+    def module_setup(self) -> None:
         """Sets up the Jinja2 environment and loads the template."""
-        super().setup(config)  # Ensure the base class logic is executed
         template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
         self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
index 5dad0ba..367cc75 100644
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -32,8 +32,7 @@ class InstagramAPIExtractor(Extractor):
         r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
     )
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         if self.api_endpoint[-1] == "/":
             self.api_endpoint = self.api_endpoint[:-1]
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index 3cf0362..e4e210f 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -25,8 +25,7 @@ class InstagramExtractor(Extractor):
     profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
     # TODO: links to stories
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.insta = instaloader.Instaloader(
             download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder,
             filename_pattern="{date_utc}_UTC_{target}__{typename}"
diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
index 5660cd2..707dcc3 100644
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -27,12 +27,11 @@ class InstagramTbotExtractor(Extractor):
     https://t.me/instagram_load_bot
     """
 
-    def setup(self, configs) -> None:
+    def module_setup(self) -> None:
         """
        1. makes a copy of session_file that is removed in cleanup
        2. checks if the session file is valid
         """
-        super().setup(configs)
         logger.info(f"SETUP {self.name} checking login...")
         self._prepare_session_file()
         self._initialize_telegram_client()
diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py
index 2f85164..c77bbc3 100644
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -13,8 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/"
 
 class S3Storage(Storage):
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 97d3e94..3762f01 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -18,14 +18,13 @@ class TelethonExtractor(Extractor):
 
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
-    def setup(self, config: dict) -> None:
+    def module_setup(self) -> None:
 
         """
         1. makes a copy of session_file that is removed in cleanup
         2. trigger login process for telegram or proceed if already saved in a session file
         3. joins channel_invites where needed
         """
-        super().setup(config)
         logger.info(f"SETUP {self.name} checking login...")
 
         # make a copy of the session that is used exclusively with this archiver instance
diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
index 6573475..0b27e22 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -15,9 +15,7 @@ class TwitterApiExtractor(Extractor):
 
     valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
-
+    def module_setup(self) -> None:
         self.api_index = 0
         self.apis = []
         if len(self.bearer_tokens):
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 2d09138..0d1fc04 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -12,8 +12,7 @@ class VkExtractor(Extractor):
     Currently only works for /wall posts
     """
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
 
     def download(self, item: Metadata) -> Metadata:
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 1586b75..7d91f43 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -18,8 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
     When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
     """
 
-    def setup(self, configs) -> None:
-        super().setup(configs)
+    def module_setup(self) -> None:
         self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
         self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
index a51ffc1..d83319e 100644
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -13,8 +13,7 @@ class WhisperEnricher(Enricher):
 
     Only works if an S3 compatible storage is used
     """
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.stores = self.config['steps']['storages']
         self.s3 = get_module("s3_storage", self.config)
         if not "s3_storage" in self.stores:

From e97ccf8a736fc6bd01a0efdf9a54c8cca16d5d97 Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Mon, 10 Feb 2025 18:07:47 +0000
Subject: [PATCH 09/17] Separate setup() and module_setup().
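
Modules previously overrode setup(config) and had to remember to call
super().setup(config) before reading their own options. The config binding
now lives in BaseModule.config_setup(), which the module loader calls right
before the module's own setup() hook, so subclasses override a no-argument
setup() and need no super() call. Roughly, a module author now writes
something like the sketch below (ExampleStorage and its "region" option are
made up for illustration):

    from auto_archiver.core import Storage

    class ExampleStorage(Storage):
        def setup(self) -> None:
            # config_setup() has already copied this module's config entries
            # onto the instance, so declared options (here a hypothetical
            # "region") are plain attributes by the time setup() runs.
            self.endpoint = f"https://{self.region}.example.com"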
---
 src/auto_archiver/core/base_module.py                      | 6 +++---
 src/auto_archiver/core/module.py                           | 6 +++---
 src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 2 +-
 src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py   | 2 +-
 src/auto_archiver/modules/html_formatter/html_formatter.py | 2 +-
 .../instagram_api_extractor/instagram_api_extractor.py     | 2 +-
 .../modules/instagram_extractor/instagram_extractor.py     | 2 +-
 .../instagram_tbot_extractor/instagram_tbot_extractor.py   | 2 +-
 src/auto_archiver/modules/s3_storage/s3_storage.py         | 2 +-
 .../modules/telethon_extractor/telethon_extractor.py       | 2 +-
 .../modules/twitter_api_extractor/twitter_api_extractor.py | 2 +-
 src/auto_archiver/modules/vk_extractor/vk_extractor.py     | 2 +-
 src/auto_archiver/modules/wacz_enricher/wacz_enricher.py   | 2 +-
 .../modules/whisper_enricher/whisper_enricher.py           | 2 +-
 14 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index 95575e3..ece4719 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -14,7 +14,7 @@ class BaseModule(ABC):
     Base module class. All modules should inherit from this class.
 
     The exact methods a class implements will depend on the type of module it is,
-    however all modules have a .setup(config: dict) method to run any setup code
+    however modules can have a .setup() method to run any setup code
     (e.g. logging in to a site, spinning up a browser etc.)
 
     See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
@@ -60,7 +60,7 @@ class BaseModule(ABC):
     def storages(self) -> list:
         return self.config.get('storages', [])
 
-    def setup(self, config: dict):
+    def config_setup(self, config: dict):
 
         authentication = config.get('authentication', {})
 
         # extract out concatenated sites
@@ -80,7 +80,7 @@ class BaseModule(ABC):
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)
 
-    def module_setup(self):
+    def setup(self):
         # For any additional setup required by modules, e.g. authentication
         pass
diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index 69f9fcc..c81e26a 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -58,7 +58,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
     This has all the information about the module, but does not load the module itself or its dependencies
 
-    To load an actual module, call .setup() on a laz module
+    To load an actual module, call .setup() on a lazy module
     """
 
     if module_name in _LAZY_LOADED_MODULES:
@@ -241,8 +241,8 @@ class LazyBaseModule:
         # merge the default config with the user config
 
         default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
         config[self.name] = default_config | config.get(self.name, {})
-        instance.setup(config)
-        instance.module_setup()
+        instance.config_setup(config)
+        instance.setup()
         return instance
 
     def __repr__(self):
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 51c13c2..f38feb6 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -19,7 +19,7 @@ from auto_archiver.core import Storage
 
 class GDriveStorage(Storage):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.scopes = ['https://www.googleapis.com/auth/drive']
         # Initialize Google Drive service
         self._setup_google_drive_service()
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index dd98032..8612d02 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -21,7 +21,7 @@ from . import GWorksheet
 
 class GsheetsFeeder(Feeder):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.gsheets_client = gspread.service_account(filename=self.service_account)
         # TODO mv to validators
         assert self.sheet or self.sheet_id, (
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index bbba097..3691735 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -17,7 +17,7 @@ class HtmlFormatter(Formatter):
     environment: Environment = None
     template: any = None
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         """Sets up the Jinja2 environment and loads the template."""
         template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
         self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
index 367cc75..a75e065 100644
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -32,7 +32,7 @@ class InstagramAPIExtractor(Extractor):
         r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
     )
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         if self.api_endpoint[-1] == "/":
             self.api_endpoint = self.api_endpoint[:-1]
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index e4e210f..0af2c32 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -25,7 +25,7 @@ class InstagramExtractor(Extractor):
     profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
     # TODO: links to stories
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.insta = instaloader.Instaloader(
             download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder,
             filename_pattern="{date_utc}_UTC_{target}__{typename}"
diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
index 707dcc3..d4b7a8e 100644
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -27,7 +27,7 @@ class InstagramTbotExtractor(Extractor):
     https://t.me/instagram_load_bot
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         """
        1. makes a copy of session_file that is removed in cleanup
        2. checks if the session file is valid
diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py
index c77bbc3..6590ac9 100644
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -13,7 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/"
 
 class S3Storage(Storage):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 3762f01..65ea8cd 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -18,7 +18,7 @@ class TelethonExtractor(Extractor):
 
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
 
         """
         1. makes a copy of session_file that is removed in cleanup
diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
index 0b27e22..72fd2f2 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -15,7 +15,7 @@ class TwitterApiExtractor(Extractor):
 
     valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.api_index = 0
         self.apis = []
         if len(self.bearer_tokens):
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 0d1fc04..99527c4 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -12,7 +12,7 @@ class VkExtractor(Extractor):
     Currently only works for /wall posts
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
 
     def download(self, item: Metadata) -> Metadata:
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 7d91f43..c324c62 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -18,7 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
     When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
         self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
index d83319e..89579f9 100644
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -13,7 +13,7 @@ class WhisperEnricher(Enricher):
 
     Only works if an S3 compatible storage is used
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.stores = self.config['steps']['storages']
         self.s3 = get_module("s3_storage", self.config)
         if not "s3_storage" in self.stores:

From 3dae2337a1e3a97b913780b58e45adbc1d0aff5a Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Mon, 10 Feb 2025 18:56:46 +0000
Subject: [PATCH 10/17] remove cdn_url check before storage.
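
Media.is_stored() used to probe the storage via get_cdn_url(), which for
Google Drive meant looking a file up by parent folder and name before it had
ever been uploaded, so the check itself could fail; the Drive helpers papered
over that with raise_on_missing=False. The check is now a plain count (as in
the hunk below), and the Drive lookups can raise on genuinely missing entries:

    def is_stored(self, in_storage) -> bool:
        # A media item counts as stored once it has collected one URL per
        # configured storage; note this assumes each storage appends exactly
        # one URL to media.urls when it stores the file.
        return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])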
---
 src/auto_archiver/core/media.py                            | 2 +-
 src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 952a025..b6820ab 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -65,7 +65,7 @@ class Media:
 
     def is_stored(self, in_storage) -> bool:
         # checks if the media is already stored in the given storage
-        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u])
+        return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
 
     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index f38feb6..4971030 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -68,11 +68,10 @@ class GDriveStorage(Storage):
         filename = path_parts[-1]
         logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
         for folder in path_parts[0:-1]:
-            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
             parent_id = folder_id
 
         # get id of file inside folder (or sub folder)
-        # TODO: supressing the error as being checked before first upload
-        file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False)
+        file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
         if not file_id:

From a69ac3e509eed60f1801aca605531b6bc8f3e506 Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Tue, 11 Feb 2025 09:46:22 +0000
Subject: [PATCH 11/17] Fix file hash reference in S3 tests

---
 tests/storages/test_S3_storage.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py
index 2594e73..e532a18 100644
--- a/tests/storages/test_S3_storage.py
+++ b/tests/storages/test_S3_storage.py
@@ -2,13 +2,12 @@ from typing import Type
 import pytest
 from unittest.mock import MagicMock, patch
 from auto_archiver.core import Media
-from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.modules.s3_storage import s3_storage
 
 
-class TestGDriveStorage:
+class TestS3Storage:
     """
-    Test suite for GDriveStorage.
+    Test suite for S3Storage.
""" module_name: str = "s3_storage" storage: Type[s3_storage] @@ -66,7 +65,7 @@ class TestGDriveStorage: # Set duplicate checking config to true: self.storage.random_no_duplicate = True - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calc_hash, \ patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' mock_file_in_folder.return_value = 'existing_key.txt' @@ -87,8 +86,7 @@ class TestGDriveStorage: # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calculate_hash: mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False From 18666ff027526b99114d2b4ffb6304f9b3a83461 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:28:24 +0000 Subject: [PATCH 12/17] skip authenticated tests in test_gsheet_feeder.py --- tests/feeders/test_gsheet_feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index ecf57f1..bdf3e70 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -185,7 +185,7 @@ def test_should_process_sheet(setup_module): assert gdb.should_process_sheet("AnotherSheet") == False -# @pytest.mark.skip(reason="Requires a real connection") +@pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: """Testing GSheetsFeeder class""" From 1792e02d1d32c99ca1a59aeb0cab33a74d3a783e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:34:36 +0000 Subject: [PATCH 13/17] skip authenticated tests in test_gdrive_storage.py --- tests/storages/test_gdrive_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 4259cb2..57480d0 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -35,7 +35,7 @@ class TestGDriveStorage(TestStorageBase): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" -# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): """ From 89d9140d15eb9e4261abf27f9c71df47ef8efb07 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:47:11 +0000 Subject: [PATCH 14/17] Fixed setup/ config_setup reference --- tests/storages/test_gdrive_storage.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 57480d0..aba0a25 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -7,7 +7,7 @@ from auto_archiver.core.metadata import Metadata from tests.storages.test_storage_base import TestStorageBase -class TestGDriveStorage(TestStorageBase): +class TestGDriveStorage: """ Test suite for GDriveStorage. 
""" @@ -21,6 +21,10 @@ class TestGDriveStorage(TestStorageBase): 'service_account': 'fake_service_account.json' } + @pytest.fixture(autouse=True) + def gdrive(self, setup_module): + with patch('google.oauth2.service_account.Credentials.from_service_account_file') as mock_creds: + self.storage = setup_module(self.module_name, self.config) def test_initialize_fails_with_non_existent_creds(self): """ @@ -28,13 +32,15 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.setup(self.config) + self.storage.setup() assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" + @pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): From f97ec6a9e0ac20268f045b661f2e080ff1eb8574 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:58:28 +0000 Subject: [PATCH 15/17] Fixed S3 module import --- tests/storages/test_S3_storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index e532a18..2a5d026 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -2,7 +2,7 @@ from typing import Type import pytest from unittest.mock import MagicMock, patch from auto_archiver.core import Media -from auto_archiver.modules.s3_storage import s3_storage +from auto_archiver.modules.s3_storage import S3Storage class TestS3Storage: @@ -10,7 +10,7 @@ class TestS3Storage: Test suite for S3Storage. """ module_name: str = "s3_storage" - storage: Type[s3_storage] + storage: Type[S3Storage] s3: MagicMock config: dict = { "path_generator": "flat", @@ -78,7 +78,7 @@ class TestS3Storage: ) - @patch.object(s3_storage.S3Storage, 'file_in_folder') + @patch.object(S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True @@ -97,7 +97,7 @@ class TestS3Storage: mock_upload.assert_not_called() assert result is True - @patch.object(s3_storage.S3Storage, 'is_upload_needed') + @patch.object(S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") media.key = "original_key.txt" From 5e2e93382ffc47893183aae83ff138055b0edeb8 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 12:17:42 +0000 Subject: [PATCH 16/17] Test fixes for 3.10 compliance. 
---
 tests/databases/test_gsheet_db.py   | 2 +-
 tests/feeders/test_gsheet_feeder.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index 0a655a8..32e8403 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -24,7 +24,7 @@ def mock_metadata():
     metadata.status = "done"
     metadata.get_title.return_value = "Example Title"
     metadata.get.return_value = "Example Content"
-    metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z"
+    metadata.get_timestamp.return_value = "2025-01-01T00:00:00"
     metadata.get_final_media.return_value = MagicMock(spec=Media)
     metadata.get_all_media.return_value = []
     metadata.get_media_by_id.return_value = None
diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py
index bdf3e70..b86e329 100644
--- a/tests/feeders/test_gsheet_feeder.py
+++ b/tests/feeders/test_gsheet_feeder.py
@@ -52,7 +52,7 @@ def gsheet_feeder(setup_module) -> GsheetsFeeder:
     return feeder
 
 
-class TestWorksheet:
+class MockWorksheet:
     """
     mimics the bits we need from gworksheet
     """
@@ -91,7 +91,7 @@ class TestWorksheet:
 
 
 def test__process_rows(gsheet_feeder: GsheetsFeeder):
-    testworksheet = TestWorksheet()
+    testworksheet = MockWorksheet()
     metadata_items = list(gsheet_feeder._process_rows(testworksheet))
     assert len(metadata_items) == 3
     assert isinstance(metadata_items[0], Metadata)
 
 
 def test__set_metadata(gsheet_feeder: GsheetsFeeder):
-    worksheet = TestWorksheet()
+    worksheet = MockWorksheet()
     metadata = Metadata()
     gsheet_feeder._set_context(metadata, worksheet, 1)
     assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}
@@ -112,7 +112,7 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe
 
 
 def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
-    testworksheet = TestWorksheet()
+    testworksheet = MockWorksheet()
     metadata = Metadata()
     testworksheet.wks.title = "TestSheet"
     gsheet_feeder._set_context(metadata, testworksheet, 6)

From d1d6cde008861f508b8689ff6fd30cdde2fccd3a Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Tue, 11 Feb 2025 12:27:48 +0000
Subject: [PATCH 17/17] Set mock timestamp without z format

---
 tests/databases/test_gsheet_db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index 32e8403..18a22f1 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -41,7 +41,7 @@ def metadata():
     metadata.set_title("Example Title")
     metadata.set_content("Example Content")
     metadata.success("my-archiver")
-    metadata.set("timestamp", "2025-01-01T00:00:00Z")
+    metadata.set("timestamp", "2025-01-01T00:00:00")
     metadata.set("date", "2025-02-04T18:22:24.909112+00:00")
     return metadata