From 52542812dcbd171f1606a4f7502becb1101bd570 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Feb 2025 16:42:58 +0000 Subject: [PATCH 01/17] Merge tests from version with context. --- .../modules/gsheet_db/gsheet_db.py | 15 +- .../instagram_tbot_extractor.py | 80 ++++-- .../modules/telethon_extractor/__init__.py | 2 +- .../telethon_extractor/telethon_extractor.py | 2 +- tests/conftest.py | 19 +- tests/databases/test_gsheet_db.py | 140 +++++++++ .../test_instagram_api_extractor.py | 108 +++++++ .../test_instagram_tbot_extractor.py | 111 ++++++++ tests/feeders/test_gsheet_feeder.py | 268 ++++++++++++++++++ tests/feeders/test_gworksheet.py | 144 ++++++++++ tests/storages/test_S3_storage.py | 100 +++++++ tests/storages/test_gdrive_storage.py | 43 +++ tests/storages/test_storage_base.py | 23 ++ 13 files changed, 1022 insertions(+), 33 deletions(-) create mode 100644 tests/databases/test_gsheet_db.py create mode 100644 tests/extractors/test_instagram_api_extractor.py create mode 100644 tests/extractors/test_instagram_tbot_extractor.py create mode 100644 tests/feeders/test_gsheet_feeder.py create mode 100644 tests/feeders/test_gworksheet.py create mode 100644 tests/storages/test_S3_storage.py create mode 100644 tests/storages/test_gdrive_storage.py create mode 100644 tests/storages/test_storage_base.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 5e1ed1e..644015e 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -12,10 +12,11 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. 
+ could be updated in the future to support non-GsheetFeeder metadata """ + def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) @@ -57,7 +58,7 @@ class GsheetsDb(Database): media: Media = item.get_final_media() if hasattr(media, "urls"): batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) + batch_if_valid('date', True, self._get_current_datetime_iso()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")) batch_if_valid('timestamp', item.get_timestamp()) @@ -85,6 +86,12 @@ class GsheetsDb(Database): gw.batch_set_cell(cell_updates) + @staticmethod + def _get_current_datetime_iso() -> str: + """Helper method to generate the current datetime in ISO format.""" + return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) @@ -93,9 +100,11 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") + # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 5b49484..5660cd2 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -34,19 +34,30 @@ class InstagramTbotExtractor(Extractor): """ super().setup(configs) logger.info(f"SETUP {self.name} checking login...") + self._prepare_session_file() + self._initialize_telegram_client() - # make a copy of the session that is used exclusively with this archiver instance + def _prepare_session_file(self): + """ + Creates a copy of the session file for exclusive use with this archiver instance. + Ensures that a valid session file exists before proceeding. + """ new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session") if not os.path.exists(f"{self.session_file}.session"): - raise FileNotFoundError(f"session file {self.session_file}.session not found, " - f"to set this up run the setup script in scripts/telegram_setup.py") + raise FileNotFoundError(f"Session file {self.session_file}.session not found.") shutil.copy(self.session_file + ".session", new_session_file) self.session_file = new_session_file.replace(".session", "") + def _initialize_telegram_client(self): + """Initializes the Telegram client.""" try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: - logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") + logger.error( + f"Unable to access the {self.session_file} session. " + "Ensure that you don't use the same session file here and in telethon_extractor. 
" + "If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}" + ) with self.client.start(): logger.success(f"SETUP {self.name} login works.") @@ -63,32 +74,49 @@ class InstagramTbotExtractor(Extractor): result = Metadata() tmp_dir = self.tmp_dir with self.client.start(): - chat = self.client.get_entity("instagram_load_bot") - since_id = self.client.send_message(entity=chat, message=url).id - attempts = 0 - seen_media = [] - message = "" - time.sleep(3) - # media is added before text by the bot so it can be used as a stop-logic mechanism - while attempts < (self.timeout - 3) and (not message or not len(seen_media)): - attempts += 1 - time.sleep(1) - for post in self.client.iter_messages(chat, min_id=since_id): - since_id = max(since_id, post.id) - if post.media and post.id not in seen_media: - filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') - media = self.client.download_media(post.media, filename_dest) - if media: - result.add_media(Media(media)) - seen_media.append(post.id) - if post.message: message += post.message + chat, since_id = self._send_url_to_bot(url) + message = self._process_messages(chat, since_id, tmp_dir, result) - if "You must enter a URL to a post" in message: + if "You must enter a URL to a post" in message: logger.debug(f"invalid link {url=} for {self.name}: {message}") return False - + # # TODO: It currently returns this as a success - is that intentional? + # if "Media not found or unavailable" in message: + # logger.debug(f"invalid link {url=} for {self.name}: {message}") + # return False + if message: result.set_content(message).set_title(message[:128]) - return result.success("insta-via-bot") + + def _send_url_to_bot(self, url: str): + """ + Sends the URL to the 'instagram_load_bot' and returns (chat, since_id). 
+ """ + chat = self.client.get_entity("instagram_load_bot") + since_message = self.client.send_message(entity=chat, message=url) + return chat, since_message.id + + def _process_messages(self, chat, since_id, tmp_dir, result): + attempts = 0 + seen_media = [] + message = "" + time.sleep(3) + # media is added before text by the bot so it can be used as a stop-logic mechanism + while attempts < (self.timeout - 3) and (not message or not len(seen_media)): + attempts += 1 + time.sleep(1) + for post in self.client.iter_messages(chat, min_id=since_id): + since_id = max(since_id, post.id) + # Skip known filler message: + if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi': + continue + if post.media and post.id not in seen_media: + filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}') + media = self.client.download_media(post.media, filename_dest) + if media: + result.add_media(Media(media)) + seen_media.append(post.id) + if post.message: message += post.message + return message.strip() \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index a837fdf..2eaa57c 100644 --- a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -1 +1 @@ -from .telethon_extractor import TelethonArchiver \ No newline at end of file +from .telethon_extractor import TelethonExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 3e952e8..0147ff2 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -13,7 +13,7 @@ from auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str -class TelethonArchiver(Extractor): +class TelethonExtractor(Extractor): valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/tests/conftest.py b/tests/conftest.py index f909bfb..8675fbc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,8 @@ """ pytest conftest file, for shared fixtures and configuration """ - +import os +import pickle from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib @@ -113,4 +114,18 @@ def pytest_runtest_setup(item): test_name = _test_failed_incremental[cls_name].get((), None) # if name found, test has failed for the combination of class name & test name if test_name is not None: - pytest.xfail(f"previous test failed ({test_name})") \ No newline at end of file + pytest.xfail(f"previous test failed ({test_name})") + + + +@pytest.fixture() +def unpickle(): + """ + Returns a helper function that unpickles a file + ** gets the file from the test_files directory: tests/data/test_files ** + """ + def _unpickle(path): + test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") + with open(os.path.join(test_data_dir, path), "rb") as f: + return pickle.load(f) + return _unpickle \ No newline at end of file diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py new file mode 100644 index 0000000..bdc2811 --- /dev/null +++ b/tests/databases/test_gsheet_db.py @@ -0,0 +1,140 @@ +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +import pytest + 
+from auto_archiver.core import Metadata, Media +from auto_archiver.modules.gsheet_db import GsheetsDb +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +@pytest.fixture +def mock_gworksheet(): + mock_gworksheet = MagicMock(spec=GWorksheet) + mock_gworksheet.col_exists.return_value = True + mock_gworksheet.get_cell.return_value = "" + mock_gworksheet.get_row.return_value = {} + return mock_gworksheet + + +@pytest.fixture +def mock_metadata(): + metadata: Metadata = MagicMock(spec=Metadata) + metadata.get_url.return_value = "http://example.com" + metadata.status = "done" + metadata.get_title.return_value = "Example Title" + metadata.get.return_value = "Example Content" + metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z" + metadata.get_final_media.return_value = MagicMock(spec=Media) + metadata.get_all_media.return_value = [] + metadata.get_media_by_id.return_value = None + metadata.get_first_image.return_value = None + return metadata + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) + metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) + metadata.set_url("http://example.com") + metadata.set_title("Example Title") + metadata.set_content("Example Content") + metadata.success("my-archiver") + metadata.set("timestamp", "2025-01-01T00:00:00Z") + metadata.set("date", "2025-02-04T18:22:24.909112+00:00") + return metadata + + +@pytest.fixture +def mock_media(): + """Fixture for a mock Media object.""" + mock_media = MagicMock(spec=Media) + mock_media.urls = ["http://example.com/media"] + mock_media.get.return_value = "not-calculated" + return mock_media + +@pytest.fixture +def gsheets_db(mock_gworksheet, setup_module): + db = setup_module("gsheet_db", { + "allow_worksheets": "set()", + "block_worksheets": "set()", + "use_sheet_names_in_stored_paths": "True", + }) + db._retrieve_gsheet = MagicMock(return_value=(mock_gworksheet, 1)) + return db + + +@pytest.fixture +def fixed_timestamp(): + """Fixture for a fixed timestamp.""" + return datetime(2025, 1, 1, tzinfo=timezone.utc) + + +@pytest.fixture +def expected_calls(mock_media, fixed_timestamp): + """Fixture for the expected cell updates.""" + return [ + (1, 'status', 'my-archiver: success'), + (1, 'archive', 'http://example.com/screenshot.png'), + (1, 'date', '2025-02-01T00:00:00+00:00'), + (1, 'title', 'Example Title'), + (1, 'text', 'Example Content'), + (1, 'timestamp', '2025-01-01T00:00:00+00:00'), + (1, 'hash', 'not-calculated'), + # (1, 'screenshot', 'http://example.com/screenshot.png'), + # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), + # (1, 'wacz', 'http://example.com/browsertrix.wacz'), + # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') + ] + +def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): + gw, row = gsheets_db._retrieve_gsheet(metadata) + assert gw == mock_gworksheet + assert row == 1 + + +def test_started(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.started(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress') + +def test_failed(gsheets_db, mock_metadata, mock_gworksheet): + reason = "Test failure" + gsheets_db.failed(mock_metadata, reason) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 
f'Archive failed {reason}') + +def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): + gsheets_db.aborted(mock_metadata) + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') + + +def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) + + +def test_done_cached(gsheets_db, metadata, mock_gworksheet): + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata, cached=True) + + # Verify the status message includes "[cached]" + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + assert any(call[2].startswith("[cached]") for call in call_args) + + +def test_done_missing_media(gsheets_db, metadata, mock_gworksheet): + # clear media from metadata + metadata.media = [] + with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + gsheets_db.done(metadata) + # Verify nothing media-related gets updated + call_args = mock_gworksheet.batch_set_cell.call_args[0][0] + media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'} + assert all(call[1] not in media_fields for call in call_args) + +def test_safe_status_update(gsheets_db, metadata, mock_gworksheet): + gsheets_db._safe_status_update(metadata, "Test status") + mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status') + + diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py new file mode 100644 index 0000000..7a19233 --- /dev/null +++ b/tests/extractors/test_instagram_api_extractor.py @@ -0,0 +1,108 @@ +from datetime import datetime +from typing import Type + +import pytest +from unittest.mock import patch, MagicMock + +from auto_archiver.core import Metadata +from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor import InstagramAPIExtractor +from .test_extractor_base import TestExtractorBase + + +@pytest.fixture +def mock_user_response(): + return { + "user": { + "pk": "123", + "username": "test_user", + "full_name": "Test User", + "profile_pic_url_hd": "http://example.com/profile.jpg", + "profile_pic_url": "http://example.com/profile_lowres.jpg" + } + } + +@pytest.fixture +def mock_post_response(): + return { + "id": "post_123", + "code": "abc123", + "caption_text": "Test Caption", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/video.mp4", + "thumbnail_url": "http://example.com/thumbnail.jpg" + } + +@pytest.fixture +def mock_story_response(): + return [{ + "id": "story_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/story.mp4" + }] + +@pytest.fixture +def mock_highlight_response(): + return { + "response": { + "reels": { + "highlight:123": { + "id": "123", + "title": "Test Highlight", + "items": [{ + "id": "item_123", + "taken_at": datetime.now().timestamp(), + "video_url": "http://example.com/highlight.mp4" + }] + } + } + } + } + + +# @pytest.mark.incremental +class TestInstagramAPIExtractor(TestExtractorBase): + """ + Test suite for InstagramAPIExtractor. 
+ """ + + extractor_module = "instagram_api_extractor" + extractor: InstagramAPIExtractor + + config = { + "access_token": "test_access_token", + "api_endpoint": "https://api.instagram.com/v1", + # "full_profile": False, + # "full_profile_max_posts": 0, + # "minimize_json_output": True, + } + + @pytest.mark.parametrize("url,expected", [ + ("https://instagram.com/user", [("", "user", "")]), + ("https://instagr.am/p/post_id", []), + ("https://youtube.com", []), + ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]), + ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]), + ("https://instagram.com/stories/user/123", [("stories", "user", "123")]), + ]) + def test_url_parsing(self, url, expected): + assert self.extractor.valid_url.findall(url) == expected + + def test_initialize(self): + self.extractor.initialise() + assert self.extractor.api_endpoint[-1] != "/" + + @pytest.mark.parametrize("input_dict,expected", [ + ({"x": 0, "valid": "data"}, {"valid": "data"}), + ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}), + ]) + def test_cleanup_dict(self, input_dict, expected): + assert self.extractor.cleanup_dict(input_dict) == expected + + def test_download_post(self): + # test with context=reel + # test with context=post + # test with multiple images + # test gets text (metadata title) + + + pass \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py new file mode 100644 index 0000000..4fe80be --- /dev/null +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -0,0 +1,111 @@ +import os +import pickle +from typing import Type +from unittest.mock import patch, MagicMock + +import pytest + +from auto_archiver.core.extractor import Extractor +from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor + + +TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") + + +@pytest.fixture +def test_session_file(tmpdir): + """Fixture to create a test session file.""" + session_file = os.path.join(tmpdir, "test_session.session") + with open(session_file, "w") as f: + f.write("mock_session_data") + return session_file.replace(".session", "") + + +@pytest.mark.incremental +class TestInstagramTbotExtractor(object): + """ + Test suite for InstagramTbotExtractor. 
+ """ + + extractor_module = "instagram_tbot_extractor" + extractor: InstagramTbotExtractor + config = { + "api_id": 12345, + "api_hash": "test_api_hash", + # "session_file" + } + + @pytest.fixture(autouse=True) + def setup_extractor(self, setup_module): + assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) + return extractor + + @pytest.fixture + def mock_telegram_client(self): + """Fixture to mock TelegramClient interactions.""" + with patch("auto_archiver.modules.instagram_tbot_extractor._initialize_telegram_client") as mock_client: + instance = MagicMock() + mock_client.return_value = instance + yield instance + + + # @pytest.fixture + # def mock_session_file(self, temp_session_file): + # """Patch the extractor’s session file setup to use a temporary path.""" + # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): + # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): + # yield # Mocks are applied for the duration of the test + + @pytest.fixture + def metadata_sample(self): + """Loads a Metadata object from a pickle file.""" + with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: + return pickle.load(f) + + + @pytest.mark.download + @pytest.mark.parametrize("url, expected_status, bot_responses", [ + ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), + ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. 
Stephanie Ladel is one such vol")]), + # todo tbot not working for stories :( + ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, [MagicMock(id=101, media=None, message="Media not found or unavailable")]), + ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), + ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), + ]) + def test_download(self, url, expected_status, bot_responses, metadata_sample): + """Test the `download()` method with various Instagram URLs.""" + metadata_sample.set_url(url) + self.extractor.initialise() + result = self.extractor.download(metadata_sample) + if expected_status: + assert result.is_success() + assert result.status == expected_status + assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + else: + assert result is False + # self.extractor.cleanup() + + # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') + # @patch.object(InstagramTbotExtractor, '_process_messages') + # def test_download_invalid_link_returns_false( + # self, mock_process, mock_send, extractor, metadata_instagram + # ): + # # Setup Mocks + # # _send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) + # mock_chat = MagicMock() + # mock_send.return_value = (mock_chat, 100) + # # _process_messages -> simulate it returns the text "You must enter a URL to a post" + # mock_process.return_value = "You must enter a URL to a post" + # result = extractor.download(metadata_instagram) + # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + + + + + # Test story +# Test expired story +# Test requires login/ access (?) +# Test post +# Test multiple images? \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py new file mode 100644 index 0000000..dbd2416 --- /dev/null +++ b/tests/feeders/test_gsheet_feeder.py @@ -0,0 +1,268 @@ +from typing import Type + +import gspread +import pytest +from unittest.mock import patch, MagicMock +from auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.core import Metadata, Feeder, ArchivingContext + + +def test_initialise_without_sheet_and_sheet_id(setup_module): + """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
+ (shouldn't really be asserting in there) + """ + with patch("gspread.service_account"): + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": None}) + with pytest.raises(AssertionError): + feeder.initialise() + + +@pytest.fixture +def gsheet_feeder(setup_module) -> GsheetsFeeder: + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + ) + feeder.gsheets_client = MagicMock() + return feeder + + +@pytest.fixture() +def worksheet(unpickle): + # Load the worksheet data from the pickle file + # only works for simple usage, cant reauthenticate but give structure + return unpickle("test_worksheet.pickle") + + +class TestWorksheet(): + """ + mimics the bits we need from gworksheet + """ + + class SheetSheet: + title = "TestSheet" + + rows = [ + { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, + { "row": 4, "url": "", "status": "", "folder": "" }, + { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, + { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, + ] + + def __init__(self): + self.wks = self.SheetSheet() + + def count_rows(self): + if not self.rows: + return 0 + return max(r["row"] for r in self.rows) + + def get_cell(self, row, col_name, fresh=False): + matching = next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, "") + + def get_cell_or_default(self, row, col_name, default): + matching = next((r for r in self.rows if r["row"] == row), {}) + return matching.get(col_name, default) + +def test__process_rows(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + metadata_items = list(gsheet_feeder._process_rows(testworksheet)) + assert len(metadata_items) == 3 + assert isinstance(metadata_items[0], Metadata) + assert metadata_items[0].get("url") == "http://example.com" + +def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 1) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +@pytest.mark.skip(reason="Not recognising folder column") +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): + gsheet_feeder._set_context(worksheet, 7) + assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + + +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): + testworksheet = TestWorksheet() + testworksheet.wks.title = "TestSheet" + gsheet_feeder._set_context(testworksheet, 6) + assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} + assert ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + + +@pytest.mark.usefixtures("setup_module") +@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ + ("TestSheet", None, "open", 
"TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") +]) +def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): + """Ensure open_sheet() correctly opens by name or ID based on configuration.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + mock_client.open.return_value = "MockSheet" + mock_client.open_by_key.return_value = "MockSheet" + + # Setup module with parameterized values + feeder = setup_module("gsheet_feeder", { + "service_account": "dummy.json", + "sheet": sheet, + "sheet_id": sheet_id + }) + feeder.initialise() + sheet_result = feeder.open_sheet() + # Validate the correct method was called + getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + assert sheet_result == "MockSheet", f"Failed: {description}" + + +@pytest.mark.usefixtures("setup_module") +def test_open_sheet_with_sheet_id(setup_module): + """Ensure open_sheet() correctly opens a sheet by ID.""" + with patch("gspread.service_account") as mock_service_account: + mock_client = MagicMock() + mock_service_account.return_value = mock_client + mock_client.open_by_key.return_value = "MockSheet" + feeder = setup_module("gsheet_feeder", + {"service_account": "dummy.json", + "sheet": None, + "sheet_id": "ABC123"}) + feeder.initialise() + sheet = feeder.open_sheet() + mock_client.open_by_key.assert_called_once_with("ABC123") + assert sheet == "MockSheet" + + +def test_should_process_sheet(setup_module): + gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}} + ) + assert gdb.should_process_sheet("TestSheet") == True + assert gdb.should_process_sheet("Sheet3") == False + # False if allow_worksheets is set + assert gdb.should_process_sheet("AnotherSheet") == False + + + +@pytest.mark.skip +class TestGSheetsFeederReal: + + """ Testing GSheetsFeeder class """ + module_name: str = 'gsheet_feeder' + feeder: GsheetsFeeder + config: dict = { + # TODO: Create test creds + "service_account": "secrets/service_account.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + + @pytest.fixture(autouse=True) + def setup_feeder(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.feeder: Type[Feeder] = setup_module( + self.module_name, self.config + ) + + def reset_test_sheet(self): + """Clears test sheet and re-adds headers to ensure consistent test results.""" + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + worksheet.clear() + worksheet.append_row(["Link", "Archive 
Status"]) + + def test_initialise(self): + self.feeder.initialise() + assert hasattr(self.feeder, "gsheets_client") + + @pytest.mark.download + def test_open_sheet_real_connection(self): + """Ensure open_sheet() connects to a real Google Sheets instance.""" + self.feeder.initialise() + sheet = self.feeder.open_sheet() + assert sheet is not None, "open_sheet() should return a valid sheet instance" + assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + + @pytest.mark.download + def test_iter_yields_metadata_real_data(self): + """Ensure __iter__() yields Metadata objects for real test sheet data.""" + self.reset_test_sheet() + client = gspread.service_account(self.config["service_account"]) + sheet = client.open(self.config["sheet"]) + worksheet = sheet.get_worksheet(0) + # Insert test rows as a temp method + # Next we will refactor the feeder for better testing + test_rows = [ + ["https://example.com", ""], + ["", ""], + ["https://example.com", "done"], + ] + worksheet.append_rows(test_rows) + self.feeder.initialise() + metadata_list = list(self.feeder) + + # Validate that only the first row is processed + assert len(metadata_list) == 1 + assert metadata_list[0].metadata.get("url") == "https://example.com" + + + +# TODO + +# Test two sheets +# test two sheets with different columns +# test folder implementation diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py new file mode 100644 index 0000000..e6f5cc6 --- /dev/null +++ b/tests/feeders/test_gworksheet.py @@ -0,0 +1,144 @@ +import pytest +from unittest.mock import MagicMock + +from auto_archiver.modules.gsheet_feeder import GWorksheet + + +class TestGWorksheet: + @pytest.fixture + def mock_worksheet(self): + mock_ws = MagicMock() + mock_ws.get_values.return_value = [ + ["Link", "Archive Status", "Archive Location", "Archive Date"], + ["url1", "archived", "filepath1", "2023-01-01"], + ["url2", "pending", "filepath2", "2023-01-02"], + ] + return mock_ws + + @pytest.fixture + def gworksheet(self, mock_worksheet): + return GWorksheet(mock_worksheet) + + # Test initialization and basic properties + def test_initialization_sets_headers(self, gworksheet): + assert gworksheet.headers == ["link", "archive status", "archive location", "archive date"] + + def test_count_rows_returns_correct_value(self, gworksheet): + # inc header row + assert gworksheet.count_rows() == 3 + + # Test column validation and lookup + @pytest.mark.parametrize( + "col,expected_index", + [ + ("url", 0), + ("status", 1), + ("archive", 2), + ("date", 3), + ], + ) + def test_col_index_returns_correct_index(self, gworksheet, col, expected_index): + assert gworksheet._col_index(col) == expected_index + + def test_check_col_exists_raises_for_invalid_column(self, gworksheet): + with pytest.raises(Exception, match="Column invalid_col"): + gworksheet._check_col_exists("invalid_col") + + # Test data retrieval + @pytest.mark.parametrize( + "row,expected", + [ + (1, ["Link", "Archive Status", "Archive Location", "Archive Date"]), + (2, ["url1", "archived", "filepath1", "2023-01-01"]), + (3, ["url2", "pending", "filepath2", "2023-01-02"]), + ], + ) + def test_get_row_returns_correct_data(self, gworksheet, row, expected): + assert gworksheet.get_row(row) == expected + + @pytest.mark.parametrize( + "row,col,expected", + [ + (2, "url", "url1"), + (2, "status", "archived"), + (3, "date", "2023-01-02"), + ], + ) + def test_get_cell_returns_correct_value(self, gworksheet, row, col, expected): + assert gworksheet.get_cell(row, col) 
== expected
+
+    def test_get_cell_handles_fresh_data(self, mock_worksheet, gworksheet):
+        mock_worksheet.cell.return_value.value = "fresh_value"
+        result = gworksheet.get_cell(2, "url", fresh=True)
+        assert result == "fresh_value"
+        mock_worksheet.cell.assert_called_once_with(2, 1)
+
+    # Test edge cases and error handling
+    @pytest.mark.parametrize(
+        "when_empty,expected",
+        [
+            (True, "default"),
+            (False, ""),
+        ],
+    )
+    def test_get_cell_or_default_handles_empty_values(
+        self, mock_worksheet, when_empty, expected
+    ):
+        mock_worksheet.get_values.return_value[1][0] = ""  # Empty URL cell
+        g = GWorksheet(mock_worksheet)
+        assert (
+            g.get_cell_or_default(
+                2, "url", default="default", when_empty_use_default=when_empty
+            )
+            == expected
+        )
+
+    def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
+        assert (
+            gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
+        )
+
+    # Test write operations
+    def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
+        gworksheet.set_cell(2, "url", "new_url")
+        mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")
+
+    def test_batch_set_cell_formats_requests_correctly(
+        self, mock_worksheet, gworksheet
+    ):
+        updates = [(2, "url", "new_url"), (3, "status", "processed")]
+        gworksheet.batch_set_cell(updates)
+        expected_batch = [
+            {"range": "A2", "values": [["new_url"]]},
+            {"range": "B3", "values": [["processed"]]},
+        ]
+        mock_worksheet.batch_update.assert_called_once_with(
+            expected_batch, value_input_option="USER_ENTERED"
+        )
+
+    def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
+        long_value = "x" * 50000
+        gworksheet.batch_set_cell([(1, "url", long_value)])
+        submitted_value = mock_worksheet.batch_update.call_args[0][0][0]["values"][0][0]
+        assert len(submitted_value) == 49999
+
+    # Test coordinate conversion
+    @pytest.mark.parametrize(
+        "row,col,expected",
+        [
+            (1, "url", "A1"),
+            (2, "status", "B2"),
+            (3, "archive", "C3"),
+            (4, "date", "D4"),
+        ],
+    )
+    def test_to_a1_conversion(self, gworksheet, row, col, expected):
+        assert gworksheet.to_a1(row, col) == expected
+
+    # Test empty worksheet
+    def test_empty_worksheet_initialization(self):
+        mock_ws = MagicMock()
+        mock_ws.get_values.return_value = []
+        g = GWorksheet(mock_ws)
+        assert g.headers == []
+        assert g.count_rows() == 0
diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py
new file mode 100644
index 0000000..df1c1f1
--- /dev/null
+++ b/tests/storages/test_S3_storage.py
@@ -0,0 +1,100 @@
+from typing import Type
+import pytest
+from unittest.mock import MagicMock, patch, mock_open
+from auto_archiver.core import Media
+from auto_archiver.modules.s3_storage import s3_storage
+from tests.storages.test_storage_base import TestStorageBase
+
+
+class TestS3Storage:
+    """
+    Test suite for S3Storage.
+ """ + module_name: str = "s3_storage" + storage: Type[s3_storage] + s3: MagicMock + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + + @patch('boto3.client') + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + self.storage = setup_module(self.module_name, self.config) + self.storage.initialise() + + @patch('boto3.client') + def test_client_initialization(self, mock_boto_client, setup_module): + """Test that S3 client is initialized with correct parameters""" + self.storage.initialise() + mock_boto_client.assert_called_once_with( + 's3', + region_name='test-region', + endpoint_url='https://test-region.example.com', + aws_access_key_id='test-key', + aws_secret_access_key='test-secret' + ) + + def test_get_cdn_url_generation(self): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = self.storage.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + + @patch.object(s3_storage.S3Storage, 'file_in_folder') + def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + self.storage.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify upload + assert self.storage.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + result = self.storage.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + + @patch.object(s3_storage.S3Storage, 'is_upload_needed') + def test_uploads_with_correct_parameters(self, mock_upload_needed): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: + self.storage.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) \ No newline at end of file diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py new file mode 100644 index 0000000..b7417ad --- /dev/null +++ b/tests/storages/test_gdrive_storage.py @@ -0,0 +1,43 @@ +from typing import Type +import pytest +from unittest.mock import MagicMock, patch +from auto_archiver.core import Media +from auto_archiver.modules.gdrive_storage import GDriveStorage +from auto_archiver.core.metadata import Metadata +from tests.storages.test_storage_base import TestStorageBase + + +class TestGDriveStorage(TestStorageBase): + """ + Test suite for 
GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + 'root_folder_id': "fake_root_folder_id", + 'oauth_token': None, + 'service_account': 'fake_service_account.json' + } + + @pytest.mark.skip(reason="Requires real credentials") + @pytest.mark.download + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + self.storage.service_account = 'secrets/service_account.json' # Path to real credentials + self.storage.initialise() + assert self.storage.service is not None + + + def test_initialize_fails_with_non_existent_creds(self): + """ + Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist. + """ + # Act and Assert + with pytest.raises(FileNotFoundError) as exc_info: + self.storage.initialise() + assert "No such file or directory" in str(exc_info.value) + diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py new file mode 100644 index 0000000..50d8846 --- /dev/null +++ b/tests/storages/test_storage_base.py @@ -0,0 +1,23 @@ +from typing import Type + +import pytest + +from auto_archiver.core.context import ArchivingContext +from auto_archiver.core.metadata import Metadata +from auto_archiver.core.storage import Storage + + +class TestStorageBase(object): + + module_name: str = None + config: dict = None + + @pytest.fixture(autouse=True) + def setup_storage(self, setup_module): + assert ( + self.module_name is not None + ), "self.module_name must be set on the subclass" + assert self.config is not None, "self.config must be a dict set on the subclass" + self.storage: Type[Storage] = setup_module( + self.module_name, self.config + ) From 5b0bad832f0bcf787979f18c5b8027f10b95b0a6 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 10:11:56 +0000 Subject: [PATCH 02/17] Updated test, test metadata --- .../modules/gsheet_db/gsheet_db.py | 1 - .../modules/gsheet_feeder/gsheet_feeder.py | 59 ++++--- .../test_instagram_api_extractor.py | 89 +++++++++- tests/feeders/test_gsheet_feeder.py | 10 +- tests/test_metadata.py | 161 ++++++++++++++++++ 5 files changed, 284 insertions(+), 36 deletions(-) create mode 100644 tests/test_metadata.py diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 644015e..3bb27b7 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -104,7 +104,6 @@ class GsheetsDb(Database): if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") - # todo doesn't exist, should be passed from elif self.sheet_id: print(self.sheet_id) diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index d129182..a51574e 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -37,41 +37,48 @@ class GsheetsFeeder(Feeder): def __iter__(self) -> Metadata: sh = self.open_sheet() - for ii, wks in enumerate(sh.worksheets()): - if not self.should_process_sheet(wks.title): - logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules") + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet 
'{worksheet.title}' due to allow/block rules") continue - - logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}') - gw = GWorksheet(wks, header_row=self.header, columns=self.columns) - + logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") continue - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) - # TODO: custom status parser(?) aka should_retry_from_status - if status not in ['', None]: continue + def _process_rows(self, gw: GWorksheet) -> Metadata: + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) aka should_retry_from_status + if status not in ['', None]: continue - # All checks done - archival process starts here - m = Metadata().set_url(url) - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder) and self.use_sheet_names_in_stored_paths: - folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m - m.set_context('folder', folder) - m.set_context('worksheet', {"row": row, "worksheet": gw}) - yield m + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) - logger.success(f'Finished worksheet {wks.title}') def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index 7a19233..d3f7bd6 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -9,6 +9,7 @@ from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor impor from .test_extractor_base import TestExtractorBase + @pytest.fixture def mock_user_response(): return { @@ -71,11 +72,18 @@ class TestInstagramAPIExtractor(TestExtractorBase): config = { "access_token": "test_access_token", "api_endpoint": "https://api.instagram.com/v1", - # "full_profile": False, + "full_profile": False, # "full_profile_max_posts": 0, # "minimize_json_output": True, } + @pytest.fixture + def metadata(self): + m = Metadata() + 
m.set_url("https://instagram.com/test_user") + m.set("netloc", "instagram.com") + return m + @pytest.mark.parametrize("url,expected", [ ("https://instagram.com/user", [("", "user", "")]), ("https://instagr.am/p/post_id", []), @@ -88,7 +96,6 @@ class TestInstagramAPIExtractor(TestExtractorBase): assert self.extractor.valid_url.findall(url) == expected def test_initialize(self): - self.extractor.initialise() assert self.extractor.api_endpoint[-1] != "/" @pytest.mark.parametrize("input_dict,expected", [ @@ -98,11 +105,85 @@ class TestInstagramAPIExtractor(TestExtractorBase): def test_cleanup_dict(self, input_dict, expected): assert self.extractor.cleanup_dict(input_dict) == expected - def test_download_post(self): + def test_download(self): + pass + + def test_download_post(self, metadata, mock_user_response): # test with context=reel # test with context=post # test with multiple images # test gets text (metadata title) + pass + def test_download_profile_basic(self, metadata, mock_user_response): + """Test basic profile download without full_profile""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_from_url') as mock_download: + # Mock API responses + mock_call.return_value = mock_user_response + mock_download.return_value = "profile.jpg" - pass \ No newline at end of file + result = self.extractor.download_profile(metadata, "test_user") + assert result.status == "insta profile: success" + assert result.get_title() == "Test User" + assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"]) + # Verify profile picture download + mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"}) + mock_download.assert_called_once_with("http://example.com/profile.jpg") + assert len(result.media) == 1 + assert result.media[0].filename == "profile.jpg" + + def test_download_profile_full(self, metadata, mock_user_response, mock_story_response): + """Test full profile download with stories/posts""" + with patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as mock_stories: + + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + mock_story_response + ] + mock_highlights.return_value = None + mock_stories.return_value = mock_story_response + mock_posts.return_value = None + mock_tagged.return_value = None + + result = self.extractor.download_profile(metadata, "test_user") + assert result.get("#stories") == len(mock_story_response) + mock_posts.assert_called_once_with(result, "123") + assert "errors" not in result.metadata + + def test_download_profile_not_found(self, metadata): + """Test profile not found error""" + with patch.object(self.extractor, 'call_api') as mock_call: + mock_call.return_value = {"user": None} + with pytest.raises(AssertionError) as exc_info: + self.extractor.download_profile(metadata, "invalid_user") + assert "User invalid_user not found" in str(exc_info.value) + + def test_download_profile_error_handling(self, metadata, mock_user_response): + """Test error handling in full profile mode""" + with (patch.object(self.extractor, 'call_api') as mock_call, \ + patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ + patch.object(self.extractor, 
'download_all_tagged') as mock_tagged, \ + patch.object(self.extractor, '_download_stories_reusable') as stories_tagged, \ + patch.object(self.extractor, 'download_all_posts') as mock_posts + ): + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + Exception("Stories API failed"), + Exception("Posts API failed") + ] + mock_highlights.return_value = None + mock_tagged.return_value = None + stories_tagged.return_value = None + mock_posts.return_value = None + result = self.extractor.download_profile(metadata, "test_user") + + assert result.is_success() + assert "Error downloading stories for test_user" in result.metadata["errors"] + # assert "Error downloading posts for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index dbd2416..62380f5 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -4,7 +4,7 @@ import gspread import pytest from unittest.mock import patch, MagicMock from auto_archiver.modules.gsheet_feeder import GsheetsFeeder -from auto_archiver.core import Metadata, Feeder, ArchivingContext +from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): @@ -100,21 +100,21 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 1) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @pytest.mark.skip(reason="Not recognising folder column") def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): gsheet_feeder._set_context(worksheet, 7) - assert ArchivingContext.get("gsheet") == {"row": 1, "worksheet": worksheet} + assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() testworksheet.wks.title = "TestSheet" gsheet_feeder._set_context(testworksheet, 6) - assert ArchivingContext.get("gsheet") == {"row": 6, "worksheet": testworksheet} - assert ArchivingContext.get("folder") == "some-folder/test-auto-archiver/testsheet" + assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..7270c80 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,161 @@ +import pytest +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Any +from auto_archiver.core.metadata import Metadata + + +@pytest.fixture +def basic_metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set("title", "Test Page") + return m + + +@dataclass +class MockMedia: + filename: str = "" + mimetype: str = "" + data: dict = None + + def get(self, key: str, default: Any = None) -> Any: + return self.data.get(key, default) if self.data else default + + def set(self, key: str, value: Any) -> None: + if not self.data: + self.data = {} + self.data[key] = value + + +@pytest.fixture +def media_file(): + def _create(filename="test.txt", mimetype="text/plain", hash_value=None): + m = MockMedia(filename=filename, mimetype=mimetype) + if hash_value: + m.set("hash", hash_value) 
+ return m + + return _create + + +def test_initial_state(): + m = Metadata() + assert m.status == "no archiver" + assert m.metadata == {"_processed_at": m.get("_processed_at")} + assert m.media == [] + assert isinstance(m.get("_processed_at"), datetime) + + +def test_url_properties(basic_metadata): + assert basic_metadata.get_url() == "https://example.com" + assert basic_metadata.netloc == "example.com" + + +def test_simple_merge(basic_metadata): + right = Metadata(status="success") + right.set("title", "Test Title") + + basic_metadata.merge(right) + assert basic_metadata.status == "success" + assert basic_metadata.get("title") == "Test Title" + + +def test_left_merge(): + left = ( + Metadata() + .set("tags", ["a"]) + .set("stats", {"views": 10}) + .set("status", "success") + ) + right = ( + Metadata() + .set("tags", ["b"]) + .set("stats", {"likes": 5}) + .set("status", "no archiver") + ) + + left.merge(right, overwrite_left=True) + assert left.get("status") == "no archiver" + assert left.get("tags") == ["a", "b"] + assert left.get("stats") == {"views": 10, "likes": 5} + + +def test_media_management(basic_metadata, media_file): + media1 = media_file(hash_value="abc") + media2 = media_file(hash_value="abc") # Duplicate + media3 = media_file(hash_value="def") + + basic_metadata.add_media(media1, "m1") + basic_metadata.add_media(media2, "m2") + basic_metadata.add_media(media3) + + assert len(basic_metadata.media) == 3 + basic_metadata.remove_duplicate_media_by_hash() + assert len(basic_metadata.media) == 2 + assert basic_metadata.get_media_by_id("m1") == media1 + + +def test_success(): + m = Metadata() + assert not m.is_success() + m.success("context") + assert m.is_success() + assert m.status == "context: success" + + +def test_is_empty(): + m = Metadata() + assert m.is_empty() + # meaningless ids + ( + m.set("url", "example.com") + .set("total_bytes", 100) + .set("archive_duration_seconds", 10) + .set("_processed_at", datetime.now(timezone.utc)) + ) + assert m.is_empty() + + +def test_store(): + pass + +# Test Media operations + + +# Test custom getter/setters + + +def test_get_set_url(): + m = Metadata() + m.set_url("http://example.com") + assert m.get_url() == "http://example.com" + with pytest.raises(AssertionError): + m.set_url("") + assert m.get("url") == "http://example.com" + + +def test_set_content(): + m = Metadata() + m.set_content("Some content") + assert m.get("content") == "Some content" + # Test appending + m.set_content("New content") + # Do we want to add a line break to the method? + assert m.get("content") == "Some contentNew content" + + +def test_choose_most_complex(): + pass + + +def test_get_context(): + m = Metadata() + m.set_context("somekey", "somevalue") + assert m.get_context("somekey") == "somevalue" + assert m.get_context("nonexistent") is None + m.set_context("anotherkey", "anothervalue") + # check the previous is retained + assert m.get_context("somekey") == "somevalue" + assert m.get_context("anotherkey") == "anothervalue" + assert len(m._context) == 2 From 266c7a14e6606cfd1c478cb4ed0ece602646035d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 16:53:00 +0000 Subject: [PATCH 03/17] Context related fixes, some more tests. 
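
Context now travels on each Metadata item instead of a process-global
ArchivingContext. A minimal sketch of the per-item API exercised by the
updated tests (illustrative only; `gw` stands in for a GWorksheet instance):

    m = Metadata()
    m.set_url("https://example.com")
    m.set_context("gsheet", {"row": 6, "worksheet": gw})
    m.get_context("gsheet")  # -> {"row": 6, "worksheet": gw}
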
--- .../modules/gsheet_feeder/gsheet_feeder.py | 4 +- .../modules/s3_storage/__manifest__.py | 3 +- .../modules/s3_storage/s3_storage.py | 6 +- src/auto_archiver/utils/gsheet.py | 53 ----- tests/enrichers/test_meta_enricher.py | 103 +++++++++ .../test_instagram_tbot_extractor.py | 88 +++---- tests/feeders/test_gsheet_feeder.py | 216 +++++++++--------- tests/storages/test_S3_storage.py | 123 ++++++++-- tests/storages/test_storage_base.py | 1 - 9 files changed, 370 insertions(+), 227 deletions(-) delete mode 100644 src/auto_archiver/utils/gsheet.py create mode 100644 tests/enrichers/test_meta_enricher.py diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index a51574e..50bf430 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder): logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") + logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") continue # process and yield metadata here: yield from self._process_rows(gw) logger.success(f'Finished worksheet {worksheet.title}') - def _process_rows(self, gw: GWorksheet) -> Metadata: + def _process_rows(self, gw: GWorksheet): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index df05055..bf032e7 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -3,7 +3,7 @@ "type": ["storage"], "requires_setup": True, "dependencies": { - "python": ["boto3", "loguru"], + "python": ["hash_enricher", "boto3", "loguru"], }, "configs": { "path_generator": { @@ -49,5 +49,6 @@ - Requires S3 credentials (API key and secret) and a bucket name to function. - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures. - Uses `boto3` for interaction with the S3 API. + - Depends on the `HashEnricher` module for hash calculation. 
""" } diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index f324d5c..0c0e275 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -9,10 +9,11 @@ from auto_archiver.core import Media from auto_archiver.core import Storage from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str +from auto_archiver.core.module import get_module NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage, HashEnricher): +class S3Storage(Storage): def setup(self, config: dict) -> None: super().setup(config) @@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher): def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload - hd = self.calculate_hash(media.filename) + he = get_module('hash_enricher', self.config) + hd = he.calculate_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py deleted file mode 100644 index 7a8862f..0000000 --- a/src/auto_archiver/utils/gsheet.py +++ /dev/null @@ -1,53 +0,0 @@ -import json, gspread - -from ..core import BaseModule - - -class Gsheets(BaseModule): - name = "gsheets" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO: config should be responsible for conversions - try: self.header = int(self.header) - except: pass - assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" - assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets." 
- - # TODO merge this into gsheets processors manifest - @staticmethod - def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py new file mode 100644 index 0000000..a09aaa9 --- /dev/null +++ b/tests/enrichers/test_meta_enricher.py @@ -0,0 +1,103 @@ +import datetime +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch + +import pytest + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.meta_enricher import MetaEnricher + + +@pytest.fixture +def mock_metadata(): + """Creates a mock Metadata object.""" + mock: Metadata = MagicMock(spec=Metadata) + mock.get_url.return_value = "https://example.com" + mock.is_empty.return_value = False # Default to not empty + mock.get_all_media.return_value = [] + return mock + +@pytest.fixture +def mock_media(): + """Creates a mock Media object.""" + mock: Media = MagicMock(spec=Media) + mock.filename = "mock_file.txt" + return mock + +@pytest.fixture +def metadata(): + m = Metadata() + m.set_url("https://example.com") + m.set_title("Test Title") + m.set_content("Test Content") + return m + + +@pytest.fixture(autouse=True) +def meta_enricher(setup_module): + return setup_module(MetaEnricher, {}) + + +def test_enrich_skips_empty_metadata(meta_enricher, mock_metadata): + """Test that enrich() does nothing when Metadata is empty.""" + mock_metadata.is_empty.return_value = True + meta_enricher.enrich(mock_metadata) + mock_metadata.get_url.assert_called_once() + + +def test_enrich_file_sizes(meta_enricher, metadata, tmp_path): + """Test that enrich_file_sizes() calculates and sets file sizes correctly.""" + file1 = tmp_path / "testfile_1.txt" + file2 = tmp_path / "testfile_2.txt" + file1.write_text("A" * 1000) + file2.write_text("B" * 2000) + metadata.add_media(Media(str(file1))) + metadata.add_media(Media(str(file2))) + + meta_enricher.enrich_file_sizes(metadata) + + # Verify individual media file sizes + media1 = metadata.get_all_media()[0] + media2 = metadata.get_all_media()[1] + + assert media1.get("bytes") == 1000 + assert media1.get("size") == "1000.0 bytes" + assert media2.get("bytes") == 2000 + assert media2.get("size") == "2.0 KB" + + assert metadata.get("total_bytes") == 3000 + assert metadata.get("total_size") == "2.9 KB" + +@pytest.mark.parametrize( + "size, expected", + [ + (500, "500.0 bytes"), + (1024, "1.0 KB"), + (2048, 
"2.0 KB"), + (1048576, "1.0 MB"), + (1073741824, "1.0 GB"), + ], +) +def test_human_readable_bytes(size, expected): + """Test that human_readable_bytes() converts sizes correctly.""" + enricher = MetaEnricher() + assert enricher.human_readable_bytes(size) == expected + +def test_enrich_file_sizes_no_media(meta_enricher, metadata): + """Test that enrich_file_sizes() handles empty media list gracefully.""" + meta_enricher.enrich_file_sizes(metadata) + assert metadata.get("total_bytes") == 0 + assert metadata.get("total_size") == "0.0 bytes" + + +def test_enrich_archive_duration(meta_enricher, metadata): + # Set fixed "processed at" time in the past + processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30) + metadata.set("_processed_at", processed_at) + # patch datetime + with patch("datetime.datetime") as mock_datetime: + mock_now = datetime.now(timezone.utc) + mock_datetime.now.return_value = mock_now + meta_enricher.enrich_archive_duration(metadata) + + assert metadata.get("archive_duration_seconds") == 630 \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index 4fe80be..b82641d 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -5,15 +5,16 @@ from unittest.mock import patch, MagicMock import pytest +from auto_archiver.core import Metadata from auto_archiver.core.extractor import Extractor from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor - +from tests.extractors.test_extractor_base import TestExtractorBase TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") @pytest.fixture -def test_session_file(tmpdir): +def session_file(tmpdir): """Fixture to create a test session file.""" session_file = os.path.join(tmpdir, "test_session.session") with open(session_file, "w") as f: @@ -21,27 +22,34 @@ def test_session_file(tmpdir): return session_file.replace(".session", "") -@pytest.mark.incremental -class TestInstagramTbotExtractor(object): - """ - Test suite for InstagramTbotExtractor. 
- """ +@pytest.fixture(autouse=True) +def patch_extractor_methods(request, setup_module): + with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \ + patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None): + if hasattr(request, 'cls') and hasattr(request.cls, 'config'): + request.cls.extractor = setup_module("instagram_tbot_extractor", request.cls.config) + + yield + +@pytest.fixture +def metadata_sample(): + m = Metadata() + m.set_title("Test Title") + m.set_timestamp("2021-01-01T00:00:00Z") + m.set_url("https://www.instagram.com/p/1234567890") + return m + + +class TestInstagramTbotExtractor: extractor_module = "instagram_tbot_extractor" extractor: InstagramTbotExtractor config = { "api_id": 12345, "api_hash": "test_api_hash", - # "session_file" + "session_file": "test_session", } - @pytest.fixture(autouse=True) - def setup_extractor(self, setup_module): - assert self.extractor_module is not None, "self.extractor_module must be set on the subclass" - assert self.config is not None, "self.config must be a dict set on the subclass" - extractor: Type[Extractor] = setup_module(self.extractor_module, self.config) - return extractor - @pytest.fixture def mock_telegram_client(self): """Fixture to mock TelegramClient interactions.""" @@ -50,22 +58,11 @@ class TestInstagramTbotExtractor(object): mock_client.return_value = instance yield instance - - # @pytest.fixture - # def mock_session_file(self, temp_session_file): - # """Patch the extractor’s session file setup to use a temporary path.""" - # with patch.object(InstagramTbotExtractor, "session_file", temp_session_file): - # with patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None): - # yield # Mocks are applied for the duration of the test - - @pytest.fixture - def metadata_sample(self): - """Loads a Metadata object from a pickle file.""" - with open(os.path.join(TESTFILES, "metadata_item.pkl"), "rb") as f: - return pickle.load(f) + def test_extractor_is_initialized(self): + assert self.extractor is not None - @pytest.mark.download + @patch("time.sleep") @pytest.mark.parametrize("url, expected_status, bot_responses", [ ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. 
Stephanie Ladel is one such vol")]), @@ -74,32 +71,19 @@ class TestInstagramTbotExtractor(object): ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), ]) - def test_download(self, url, expected_status, bot_responses, metadata_sample): + def test_download(self, mock_sleep, url, expected_status, bot_responses, metadata_sample): """Test the `download()` method with various Instagram URLs.""" metadata_sample.set_url(url) - self.extractor.initialise() + self.extractor.client = MagicMock() result = self.extractor.download(metadata_sample) - if expected_status: - assert result.is_success() - assert result.status == expected_status - assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] - else: - assert result is False - # self.extractor.cleanup() - - # @patch.object(InstagramTbotExtractor, '_send_url_to_bot') - # @patch.object(InstagramTbotExtractor, '_process_messages') - # def test_download_invalid_link_returns_false( - # self, mock_process, mock_send, extractor, metadata_instagram - # ): - # # Setup Mocks - # # _send_url_to_bot -> simulate it returns (chat=MagicMock, since_id=100) - # mock_chat = MagicMock() - # mock_send.return_value = (mock_chat, 100) - # # _process_messages -> simulate it returns the text "You must enter a URL to a post" - # mock_process.return_value = "You must enter a URL to a post" - # result = extractor.download(metadata_instagram) - # assert result is False, "Should return False if message includes 'You must enter a URL to a post'" + pass + # TODO fully mock or use as authenticated test + # if expected_status: + # assert result.is_success() + # assert result.status == expected_status + # assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] + # else: + # assert result is False diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 62380f5..103610e 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -9,57 +9,52 @@ from auto_archiver.core import Metadata, Feeder def test_initialise_without_sheet_and_sheet_id(setup_module): """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. 
- (shouldn't really be asserting in there) + (shouldn't really be asserting in there) """ with patch("gspread.service_account"): - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": None}) with pytest.raises(AssertionError): - feeder.initialise() + setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, + ) @pytest.fixture def gsheet_feeder(setup_module) -> GsheetsFeeder: - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": "test-auto-archiver", - "sheet_id": None, - "header": 1, - "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, - "allow_worksheets": set(), - "block_worksheets": set(), - "use_sheet_names_in_stored_paths": True, - } - ) + with patch("gspread.service_account"): + feeder = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "test-auto-archiver", + "sheet_id": None, + "header": 1, + "columns": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + }, + ) feeder.gsheets_client = MagicMock() return feeder -@pytest.fixture() -def worksheet(unpickle): - # Load the worksheet data from the pickle file - # only works for simple usage, cant reauthenticate but give structure - return unpickle("test_worksheet.pickle") - - -class TestWorksheet(): +class TestWorksheet: """ mimics the bits we need from gworksheet """ @@ -68,12 +63,17 @@ class TestWorksheet(): title = "TestSheet" rows = [ - { "row": 2, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 3, "url": "http://example.com", "status": "", "folder": "" }, - { "row": 4, "url": "", "status": "", "folder": "" }, - { "row": 5, "url": "https://another.com", "status": None, "folder": "" }, - { "row": 6, "url": "https://another.com", "status": "success", "folder": "some_folder" }, - ] + {"row": 2, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 3, "url": "http://example.com", "status": "", "folder": ""}, + {"row": 4, "url": "", "status": "", "folder": ""}, + {"row": 5, "url": "https://another.com", "status": None, "folder": ""}, + { + "row": 6, + "url": "https://another.com", + "status": "success", + "folder": "some_folder", + }, + ] def __init__(self): self.wks = self.SheetSheet() @@ -91,6 +91,7 @@ class TestWorksheet(): matching = next((r for r in self.rows if r["row"] == row), {}) return matching.get(col_name, default) + def test__process_rows(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) @@ -98,9 +99,12 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert isinstance(metadata_items[0], Metadata) assert metadata_items[0].get("url") == 
"http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder, worksheet): - gsheet_feeder._set_context(worksheet, 1) - assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} + +def test__set_metadata(gsheet_feeder: GsheetsFeeder): + worksheet = TestWorksheet() + metadata = Metadata() + gsheet_feeder._set_context(metadata, worksheet, 1) + assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} @pytest.mark.skip(reason="Not recognising folder column") @@ -111,18 +115,24 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): testworksheet = TestWorksheet() + metadata = Metadata() testworksheet.wks.title = "TestSheet" - gsheet_feeder._set_context(testworksheet, 6) - assert Metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} - assert Metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" + gsheet_feeder._set_context(metadata, testworksheet, 6) + assert metadata.get_context("gsheet") == {"row": 6, "worksheet": testworksheet} + assert metadata.get_context("folder") == "some-folder/test-auto-archiver/testsheet" @pytest.mark.usefixtures("setup_module") -@pytest.mark.parametrize("sheet, sheet_id, expected_method, expected_arg, description", [ - ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), - (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID") -]) -def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description): +@pytest.mark.parametrize( + "sheet, sheet_id, expected_method, expected_arg, description", + [ + ("TestSheet", None, "open", "TestSheet", "opening by sheet name"), + (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"), + ], +) +def test_open_sheet_with_name_or_id( + setup_module, sheet, sheet_id, expected_method, expected_arg, description +): """Ensure open_sheet() correctly opens by name or ID based on configuration.""" with patch("gspread.service_account") as mock_service_account: mock_client = MagicMock() @@ -131,15 +141,16 @@ def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_meth mock_client.open_by_key.return_value = "MockSheet" # Setup module with parameterized values - feeder = setup_module("gsheet_feeder", { - "service_account": "dummy.json", - "sheet": sheet, - "sheet_id": sheet_id - }) + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, + ) feeder.initialise() sheet_result = feeder.open_sheet() # Validate the correct method was called - getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}" + getattr(mock_client, expected_method).assert_called_once_with( + expected_arg + ), f"Failed: {description}" assert sheet_result == "MockSheet", f"Failed: {description}" @@ -150,10 +161,10 @@ def test_open_sheet_with_sheet_id(setup_module): mock_client = MagicMock() mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" - feeder = setup_module("gsheet_feeder", - {"service_account": "dummy.json", - "sheet": None, - "sheet_id": "ABC123"}) + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, + ) feeder.initialise() sheet = feeder.open_sheet() mock_client.open_by_key.assert_called_once_with("ABC123") @@ -161,47 +172,51 @@ def test_open_sheet_with_sheet_id(setup_module): def 
test_should_process_sheet(setup_module): - gdb = setup_module("gsheet_feeder", {"service_account": "dummy.json", - "sheet": "TestSheet", - "sheet_id": None, - "allow_worksheets": {"TestSheet", "Sheet2"}, - "block_worksheets": {"Sheet3"}} - ) + with patch("gspread.service_account"): + gdb = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}, + }, + ) assert gdb.should_process_sheet("TestSheet") == True assert gdb.should_process_sheet("Sheet3") == False # False if allow_worksheets is set assert gdb.should_process_sheet("AnotherSheet") == False - -@pytest.mark.skip +# @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: + """Testing GSheetsFeeder class""" - """ Testing GSheetsFeeder class """ - module_name: str = 'gsheet_feeder' + module_name: str = "gsheet_feeder" feeder: GsheetsFeeder + # You must follow the setup process explain in the docs for this to work config: dict = { - # TODO: Create test creds "service_account": "secrets/service_account.json", "sheet": "test-auto-archiver", "sheet_id": None, "header": 1, "columns": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage", - }, + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", + }, "allow_worksheets": set(), "block_worksheets": set(), "use_sheet_names_in_stored_paths": True, @@ -213,9 +228,7 @@ class TestGSheetsFeederReal: self.module_name is not None ), "self.module_name must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.feeder: Type[Feeder] = setup_module( - self.module_name, self.config - ) + self.feeder: Type[Feeder] = setup_module(self.module_name, self.config) def reset_test_sheet(self): """Clears test sheet and re-adds headers to ensure consistent test results.""" @@ -225,19 +238,17 @@ class TestGSheetsFeederReal: worksheet.clear() worksheet.append_row(["Link", "Archive Status"]) - def test_initialise(self): - self.feeder.initialise() + def test_setup(self): assert hasattr(self.feeder, "gsheets_client") - @pytest.mark.download def test_open_sheet_real_connection(self): """Ensure open_sheet() connects to a real Google Sheets instance.""" - self.feeder.initialise() sheet = self.feeder.open_sheet() assert sheet is not None, "open_sheet() should return a valid sheet instance" - assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method" + assert hasattr( + sheet, "worksheets" + ), "Returned object should have worksheets method" - @pytest.mark.download def test_iter_yields_metadata_real_data(self): """Ensure __iter__() yields Metadata objects for real test sheet data.""" self.reset_test_sheet() @@ -260,7 +271,6 @@ class TestGSheetsFeederReal: assert metadata_list[0].metadata.get("url") == 
"https://example.com" - # TODO # Test two sheets diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index df1c1f1..60b40e6 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,9 +1,101 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, patch, PropertyMock from auto_archiver.core import Media +from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -from tests.storages.test_storage_base import TestStorageBase + + +@patch('boto3.client') +@pytest.fixture +def s3_store(setup_module): + config: dict = { + "path_generator": "flat", + "filename_generator": "static", + "bucket": "test-bucket", + "region": "test-region", + "key": "test-key", + "secret": "test-secret", + "random_no_duplicate": False, + "endpoint_url": "https://{region}.example.com", + "cdn_url": "https://cdn.example.com/{key}", + "private": False, + } + s3_storage = setup_module("s3_storage", config) + return s3_storage + +def test_client_initialization(s3_store): + """Test that S3 client is initialized with correct parameters""" + assert s3_store.s3 is not None + assert s3_store.s3.meta.region_name == 'test-region' + + +def test_get_cdn_url_generation(s3_store): + """Test CDN URL formatting """ + media = Media("test.txt") + media.key = "path/to/file.txt" + url = s3_store.get_cdn_url(media) + assert url == "https://cdn.example.com/path/to/file.txt" + media.key = "another/path.jpg" + assert s3_store.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + + +@patch.object(s3_storage.S3Storage, 'file_in_folder') +def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): + """Test that upload skips when file_in_folder finds existing object""" + # Setup test-specific configuration + s3_store.random_no_duplicate = True + mock_file_in_folder.return_value = "existing_folder/existing_file.txt" + # Create test media with calculated hash + media = Media("test.txt") + media.key = "original_path.txt" + + # Mock hash calculation + with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "testhash123" + # Verify upload + assert s3_store.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + result = s3_store.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True + +@patch.object(s3_storage.S3Storage, 'is_upload_needed') +def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): + media = Media("test.txt") + mock_upload_needed.return_value = True + media.mimetype = 'image/png' + mock_file = MagicMock() + + with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: + s3_store.uploadf(mock_file, media) + + # Verify core upload parameters + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + # Key='original_key.txt', + Key=None, + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) + + + + + + + + +# ============================================================ + + + class TestGDriveStorage: @@ -29,20 +121,13 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): + he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - 
self.storage.initialise() - @patch('boto3.client') - def test_client_initialization(self, mock_boto_client, setup_module): + def test_client_initialization(self, setup_storage): """Test that S3 client is initialized with correct parameters""" - self.storage.initialise() - mock_boto_client.assert_called_once_with( - 's3', - region_name='test-region', - endpoint_url='https://test-region.example.com', - aws_access_key_id='test-key', - aws_secret_access_key='test-secret' - ) + assert self.storage.s3 is not None + assert self.storage.s3.meta.region_name == 'test-region' def test_get_cdn_url_generation(self): """Test CDN URL formatting """ @@ -53,6 +138,18 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_upload_decision_logic(self): + """Test is_upload_needed under different conditions""" + media = Media("test.txt") + + # Test random_no_duplicate disabled + assert self.storage.is_upload_needed(media) is True + + # Test duplicate exists + self.storage.random_no_duplicate = True + with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + assert self.storage.is_upload_needed(media) is False + assert media.key == 'existing.txt' @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 50d8846..7578acd 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -2,7 +2,6 @@ from typing import Type import pytest -from auto_archiver.core.context import ArchivingContext from auto_archiver.core.metadata import Metadata from auto_archiver.core.storage import Storage From e9ad1e1b85dbea08354189e775ae4718b4ea52cb Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Feb 2025 22:01:55 +0000 Subject: [PATCH 04/17] Pass media to storage cdn_call --- src/auto_archiver/core/media.py | 2 +- .../modules/gdrive_storage/gdrive_storage.py | 11 +- tests/storages/test_S3_storage.py | 149 +++++------------- 3 files changed, 49 insertions(+), 113 deletions(-) diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 2cb6fc9..952a025 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -65,7 +65,7 @@ class Media: def is_stored(self, in_storage) -> bool: # checks if the media is already stored in the given storage - return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u]) + return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u]) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index b764f1d..cc9cf3d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -74,7 +74,8 @@ class GDriveStorage(Storage): parent_id = folder_id # get id of file inside folder (or sub folder) - file_id = self._get_id_from_parent_and_name(folder_id, filename) + # TODO: supressing the error as being checked before first upload + file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: @@ -106,7 +107,13 @@ class 
GDriveStorage(Storage): # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass - def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): + def _get_id_from_parent_and_name(self, parent_id: str, + name: str, + retries: int = 1, + sleep_seconds: int = 10, + use_mime_type: bool = False, + raise_on_missing: bool = True, + use_cache=False): """ Retrieves the id of a folder or file from its @name and the @parent_id folder Optionally does multiple @retries and sleeps @sleep_seconds between them diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 60b40e6..2594e73 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,103 +1,11 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch, PropertyMock +from unittest.mock import MagicMock, patch from auto_archiver.core import Media from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.s3_storage import s3_storage -@patch('boto3.client') -@pytest.fixture -def s3_store(setup_module): - config: dict = { - "path_generator": "flat", - "filename_generator": "static", - "bucket": "test-bucket", - "region": "test-region", - "key": "test-key", - "secret": "test-secret", - "random_no_duplicate": False, - "endpoint_url": "https://{region}.example.com", - "cdn_url": "https://cdn.example.com/{key}", - "private": False, - } - s3_storage = setup_module("s3_storage", config) - return s3_storage - -def test_client_initialization(s3_store): - """Test that S3 client is initialized with correct parameters""" - assert s3_store.s3 is not None - assert s3_store.s3.meta.region_name == 'test-region' - - -def test_get_cdn_url_generation(s3_store): - """Test CDN URL formatting """ - media = Media("test.txt") - media.key = "path/to/file.txt" - url = s3_store.get_cdn_url(media) - assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" - assert s3_store.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" - - -@patch.object(s3_storage.S3Storage, 'file_in_folder') -def test_skips_upload_when_duplicate_exists(mock_file_in_folder, s3_store): - """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration - s3_store.random_no_duplicate = True - mock_file_in_folder.return_value = "existing_folder/existing_file.txt" - # Create test media with calculated hash - media = Media("test.txt") - media.key = "original_path.txt" - - # Mock hash calculation - with patch.object(s3_store, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" - # Verify upload - assert s3_store.is_upload_needed(media) is False - assert media.key == "existing_folder/existing_file.txt" - assert media.get("previously archived") is True - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - result = s3_store.uploadf(None, media) - mock_upload.assert_not_called() - assert result is True - -@patch.object(s3_storage.S3Storage, 'is_upload_needed') -def test_uploads_with_correct_parameters(mock_upload_needed, s3_store): - media = Media("test.txt") - mock_upload_needed.return_value = True - media.mimetype = 'image/png' - mock_file = MagicMock() - - with patch.object(s3_store.s3, 'upload_fileobj') as mock_upload: - s3_store.uploadf(mock_file, media) - - # Verify 
core upload parameters - mock_upload.assert_called_once_with( - mock_file, - Bucket='test-bucket', - # Key='original_key.txt', - Key=None, - ExtraArgs={ - 'ACL': 'public-read', - 'ContentType': 'image/png' - } - ) - - - - - - - - -# ============================================================ - - - - - class TestGDriveStorage: """ Test suite for GDriveStorage. @@ -121,10 +29,9 @@ class TestGDriveStorage: @patch('boto3.client') @pytest.fixture(autouse=True) def setup_storage(self, setup_module): - he = HashEnricher() self.storage = setup_module(self.module_name, self.config) - def test_client_initialization(self, setup_storage): + def test_client_initialization(self): """Test that S3 client is initialized with correct parameters""" assert self.storage.s3 is not None assert self.storage.s3.meta.region_name == 'test-region' @@ -138,37 +45,55 @@ class TestGDriveStorage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" + def test_uploadf_sets_acl_public(self): + media = Media("test.txt") + mock_file = MagicMock() + with patch.object(self.storage.s3, 'upload_fileobj') as mock_s3_upload, \ + patch.object(self.storage, 'is_upload_needed', return_value=True): + self.storage.uploadf(mock_file, media) + mock_s3_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + Key=media.key, + ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} + ) + def test_upload_decision_logic(self): """Test is_upload_needed under different conditions""" media = Media("test.txt") - - # Test random_no_duplicate disabled + # Test default state (random_no_duplicate=False) assert self.storage.is_upload_needed(media) is True + # Set duplicate checking config to true: - # Test duplicate exists self.storage.random_no_duplicate = True - with patch.object(self.storage, 'file_in_folder', return_value='existing.txt'): + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: + mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' + mock_file_in_folder.return_value = 'existing_key.txt' + # Test duplicate result assert self.storage.is_upload_needed(media) is False - assert media.key == 'existing.txt' + assert media.key == 'existing_key.txt' + mock_file_in_folder.assert_called_with( + # (first 24 chars of hash) + 'no-dups/beepboop123beepboop123be' + ) + @patch.object(s3_storage.S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" - # Setup test-specific configuration self.storage.random_no_duplicate = True mock_file_in_folder.return_value = "existing_folder/existing_file.txt" # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - # Mock hash calculation - with patch.object(self.storage, 'calculate_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "testhash123" + with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" assert media.get("previously archived") is True - with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: result = self.storage.uploadf(None, media) 
mock_upload.assert_not_called() @@ -177,21 +102,25 @@ class TestGDriveStorage: @patch.object(s3_storage.S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") + media.key = "original_key.txt" mock_upload_needed.return_value = True media.mimetype = 'image/png' mock_file = MagicMock() with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: self.storage.uploadf(mock_file, media) - - # Verify core upload parameters + # verify call occured with these params mock_upload.assert_called_once_with( mock_file, Bucket='test-bucket', - # Key='original_key.txt', - Key=None, + Key='original_key.txt', ExtraArgs={ 'ACL': 'public-read', 'ContentType': 'image/png' } - ) \ No newline at end of file + ) + + def test_file_in_folder_exists(self): + with patch.object(self.storage.s3, 'list_objects') as mock_list_objects: + mock_list_objects.return_value = {'Contents': [{'Key': 'path/to/file.txt'}]} + assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' \ No newline at end of file From 2920cf685f8c556cbdfa8d805f1eb20b8fe41d66 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 12:35:40 +0000 Subject: [PATCH 05/17] Small fixes to whisper_enricher.py. --- src/auto_archiver/modules/whisper_enricher/__manifest__.py | 6 ++++-- .../modules/whisper_enricher/whisper_enricher.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index f7ad1b3..884de66 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -6,8 +6,10 @@ "python": ["s3_storage", "loguru", "requests"], }, "configs": { - "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, - "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, + "api_endpoint": {"required": True, + "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, + "api_key": {"required": True, + "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 8ca2131..a7298e4 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -110,7 +110,7 @@ class WhisperEnricher(Enricher): def _get_s3_storage(self) -> S3Storage: try: - return next(s for s in self.storages if s.__class__ == S3Storage) + return next(s for s in self.config['steps']['storages'] if s == 's3_storage') except: logger.warning("No S3Storage instance found in storages") return From 950624dd4bb0e917abbe58c98351bbabd26d0bb3 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 20:26:00 +0000 Subject: [PATCH 06/17] Fix S3 storage to media in 
whisper_enricher.py. --- .../modules/whisper_enricher/__manifest__.py | 7 +++++-- .../whisper_enricher/whisper_enricher.py | 19 ++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 884de66..1539df6 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -{ +a={ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, @@ -12,7 +12,9 @@ "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + "action": {"default": "translate", + "help": "which Whisper operation to execute", + "choices": ["transcribe", "translate", "language_detection"]}, }, "description": """ Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files. @@ -27,6 +29,7 @@ ### Notes - Requires a Whisper API endpoint and API key for authentication. - Only compatible with S3-compatible storage systems for media file accessibility. + - ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files. - Handles multiple jobs and retries for failed or incomplete processing. """ } diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a7298e4..004d91c 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -15,17 +15,21 @@ class WhisperEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - if not self._get_s3_storage(): + storages = self.config['steps']['storages'] + if not "s3_storage" in storages: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return + self.s3 = get_module("s3_storage", self.config) url = to_enrich.get_url() logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich, storages=self.storages) + # TODO: this used to pass all storage items to store now + # Now only passing S3, the rest will get added later in the usual order (?) 
+ m.store(url=url, metadata=to_enrich, storages=[self.s3]) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -53,8 +57,8 @@ class WhisperEnricher(Enricher): to_enrich.set_content(f"\n[automatic video transcript]: {v}") def submit_job(self, media: Media): - s3 = get_module("s3_storage", self.config) - s3_url = s3.get_cdn_url(media) + + s3_url = self.s3.get_cdn_url(media) assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls " payload = { "url": s3_url, @@ -107,10 +111,3 @@ class WhisperEnricher(Enricher): logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}") return result return False - - def _get_s3_storage(self) -> S3Storage: - try: - return next(s for s in self.config['steps']['storages'] if s == 's3_storage') - except: - logger.warning("No S3Storage instance found in storages") - return From f311621e58446983fb95d9e510249855a7687f61 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 15:57:42 +0000 Subject: [PATCH 07/17] Small fixes. Add timestamp helper method. --- .../modules/gdrive_storage/gdrive_storage.py | 7 +- .../modules/gsheet_db/gsheet_db.py | 70 ++++++++++--------- .../telethon_extractor/telethon_extractor.py | 4 +- .../modules/whisper_enricher/__manifest__.py | 2 +- .../whisper_enricher/whisper_enricher.py | 13 ++-- src/auto_archiver/utils/misc.py | 36 +++++++++- tests/databases/test_gsheet_db.py | 8 ++- .../test_instagram_api_extractor.py | 3 +- .../test_instagram_tbot_extractor.py | 1 - tests/feeders/test_gsheet_feeder.py | 9 +-- tests/storages/test_gdrive_storage.py | 41 ++++++++--- tests/test_metadata.py | 4 ++ 12 files changed, 129 insertions(+), 69 deletions(-) diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index cc9cf3d..910f48b 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -70,12 +70,15 @@ class GDriveStorage(Storage): filename = path_parts[-1] logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) parent_id = folder_id - # get id of file inside folder (or sub folder) # TODO: supressing the error as being checked before first upload file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) + if not file_id: + # + logger.info(f"file {filename} not found in folder {folder_id}") + return None return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 3bb27b7..682eb94 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -1,6 +1,4 @@ from typing import Union, Tuple - -import datetime from urllib.parse import quote from loguru import logger @@ -8,33 +6,33 @@ from loguru import logger from auto_archiver.core import Database from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.utils.misc import get_current_timestamp class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. + could be updated in the future to support non-GsheetFeeder metadata """ - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') + gw.set_cell(row, "status", "Archive in progress") - def failed(self, item: Metadata, reason:str) -> None: + def failed(self, item: Metadata, reason: str) -> None: logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') + self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') + self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check if the given item has been archived already""" return False - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) @@ -46,23 +44,25 @@ class GsheetsDb(Database): def batch_if_valid(col, val, final_value=None): final_value = final_value or val try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: logger.error(f"Unable to batch {col}={final_value} due to {e}") + status_message = item.status if cached: status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) + cell_updates.append((row, "status", status_message)) media: Media = item.get_final_media() if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, self._get_current_datetime_iso()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if media: batch_if_valid('hash', media.get("hash", "not-calculated")) + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) # merge all pdq hashes into a single string, if present pdq_hashes = [] @@ -71,31 +71,35 @@ class GsheetsDb(Database): if pdq := m.get("pdq_hash"): pdq_hashes.append(pdq) if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) - if (thumbnail := item.get_first_image("thumbnail")): + if thumbnail := item.get_first_image("thumbnail"): if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', 
"\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) gw.batch_set_cell(cell_updates) - @staticmethod - def _get_current_datetime_iso() -> str: - """Helper method to generate the current datetime in ISO format.""" - return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) + gw.set_cell(row, "status", new_status) except Exception as e: logger.debug(f"Unable to update sheet: {e}") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 0147ff2..947db9e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,12 +18,14 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self) -> None: + def setup(self, config: dict) -> None: + """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. joins channel_invites where needed """ + super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 1539df6..98e743e 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -a={ +{ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 004d91c..a51ffc1 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): @@ -14,13 +13,17 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def enrich(self, to_enrich: Metadata) -> None: - storages = self.config['steps']['storages'] - if not "s3_storage" in storages: + def setup(self, config: dict) -> None: + super().setup(config) + self.stores = self.config['steps']['storages'] + self.s3 = get_module("s3_storage", self.config) + if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return - self.s3 = get_module("s3_storage", self.config) + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() 
         logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py
index 300a710..e4c214c 100644
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -1,9 +1,7 @@
-
-
 import os
 import json
 import uuid
-from datetime import datetime
+from datetime import datetime, timezone
 
 import requests
 from loguru import logger
@@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str:
     assert length <= 32, "length must be less than 32 as UUID4 is used"
     return str(uuid.uuid4()).replace("-", "")[:length]
 
+
 def json_loader(cli_val):
     return json.loads(cli_val)
+
+
+def get_current_datetime_iso() -> str:
+    return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat()
+
+
+def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
+    # parse a datetime string with option of passing a specific format
+    try:
+        return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
+    except ValueError as e:
+        logger.error(f"Unable to parse datestring {dt_str}: {e}")
+        return None
+
+
+def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
+    # Consistent parsing of timestamps
+    # If utc=True, the timezone is set to UTC,
+    # if iso=True, the output is an iso string
+    if not ts: return
+    try:
+        if isinstance(ts, str): ts = datetime.fromisoformat(ts)
+        if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
+        if utc: ts = ts.replace(tzinfo=timezone.utc)
+        if iso: return ts.isoformat()
+        return ts
+    except Exception as e:
+        logger.error(f"Unable to parse timestamp {ts}: {e}")
+        return None
+
+def get_current_timestamp() -> str:
+    return get_timestamp(datetime.now())
\ No newline at end of file
diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index bdc2811..0a655a8 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -103,19 +103,20 @@ def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
     gsheets_db.failed(mock_metadata, reason)
     mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
 
+
 def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
     gsheets_db.aborted(mock_metadata)
     mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
 
 
 def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls):
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata)
 
     mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
 
 
 def test_done_cached(gsheets_db, metadata, mock_gworksheet):
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata, cached=True)
 
     # Verify the status message includes "[cached]"
@@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet):
 def test_done_missing_media(gsheets_db, metadata, mock_gworksheet):
     # clear media from metadata
     metadata.media = []
-    with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
+    with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp",
+               return_value='2025-02-01T00:00:00+00:00'):
         gsheets_db.done(metadata)
 
     # Verify nothing media-related gets updated
     call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py
index d3f7bd6..c119e3f 100644
--- a/tests/extractors/test_instagram_api_extractor.py
+++ b/tests/extractors/test_instagram_api_extractor.py
@@ -185,5 +185,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
 
         result = self.extractor.download_profile(metadata, "test_user")
         assert result.is_success()
-        assert "Error downloading stories for test_user" in result.metadata["errors"]
-        # assert "Error downloading posts for test_user" in result.metadata["errors"]
\ No newline at end of file
+        assert "Error downloading stories for test_user" in result.metadata["errors"]
\ No newline at end of file
diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py
index b82641d..d7a1e53 100644
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -1,5 +1,4 @@
 import os
-import pickle
 from typing import Type
 from unittest.mock import patch, MagicMock
 
diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py
index 103610e..ecf57f1 100644
--- a/tests/feeders/test_gsheet_feeder.py
+++ b/tests/feeders/test_gsheet_feeder.py
@@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
 from auto_archiver.core import Metadata, Feeder
 
 
-def test_initialise_without_sheet_and_sheet_id(setup_module):
-    """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set.
-    (shouldn't really be asserting in there)
-    """
+def test_setup_without_sheet_and_sheet_id(setup_module):
+    # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set.
     with patch("gspread.service_account"):
         with pytest.raises(AssertionError):
             setup_module(
@@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id(
         "gsheet_feeder",
         {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
     )
-    feeder.initialise()
     sheet_result = feeder.open_sheet()
     # Validate the correct method was called
     getattr(mock_client, expected_method).assert_called_once_with(
@@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module):
         "gsheet_feeder",
         {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
     )
-    feeder.initialise()
     sheet = feeder.open_sheet()
     mock_client.open_by_key.assert_called_once_with("ABC123")
     assert sheet == "MockSheet"
@@ -263,7 +259,6 @@ class TestGSheetsFeederReal:
             ["https://example.com", "done"],
         ]
         worksheet.append_rows(test_rows)
-        self.feeder.initialise()
         metadata_items = list(self.feeder)
 
         # Validate that only the first row is processed
diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py
index b7417ad..4259cb2 100644
--- a/tests/storages/test_gdrive_storage.py
+++ b/tests/storages/test_gdrive_storage.py
@@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase):
         'service_account': 'fake_service_account.json'
     }
 
-    @pytest.mark.skip(reason="Requires real credentials")
-    @pytest.mark.download
-    def test_initialize_with_real_credentials(self):
-        """
-        Test that the Google Drive service can be initialized with real credentials.
- """ - self.storage.service_account = 'secrets/service_account.json' # Path to real credentials - self.storage.initialise() - assert self.storage.service is not None - def test_initialize_fails_with_non_existent_creds(self): """ @@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.initialise() + self.storage.setup(self.config) assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): + media = Media(filename="test.jpg") + media.key = "folder1/folder2/test.jpg" + +# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.download +class TestGDriveStorageConnected(TestStorageBase): + """ + 'Real' tests for GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + # TODO: replace with real root folder id + 'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk", + 'oauth_token': None, + 'service_account': 'secrets/service_account.json' + } + + + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + assert self.storage.service is not None + + diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7270c80..b07e107 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -159,3 +159,7 @@ def test_get_context(): assert m.get_context("somekey") == "somevalue" assert m.get_context("anotherkey") == "anothervalue" assert len(m._context) == 2 + + +def test_choose_most_complete(): + pass \ No newline at end of file From 2c3d1f591f4a721597e2cd9906c1cdc05db8a78e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 17:25:15 +0000 Subject: [PATCH 08/17] Separate setup() and module_setup(). --- src/auto_archiver/core/base_module.py | 4 ++++ src/auto_archiver/core/module.py | 1 + src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 4 +--- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 3 +-- src/auto_archiver/modules/html_formatter/html_formatter.py | 3 +-- .../instagram_api_extractor/instagram_api_extractor.py | 3 +-- .../modules/instagram_extractor/instagram_extractor.py | 3 +-- .../instagram_tbot_extractor/instagram_tbot_extractor.py | 3 +-- src/auto_archiver/modules/s3_storage/s3_storage.py | 3 +-- .../modules/telethon_extractor/telethon_extractor.py | 3 +-- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 +--- src/auto_archiver/modules/vk_extractor/vk_extractor.py | 3 +-- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 3 +-- .../modules/whisper_enricher/whisper_enricher.py | 3 +-- 14 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 5c6ecbb..95575e3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -80,6 +80,10 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) + def module_setup(self): + # For any additional setup required by modules, e.g. autehntication + pass + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: """ Returns the authentication information for a given site. 
         This is used to authenticate
diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index f3fbec5..69f9fcc 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -242,6 +242,7 @@ class LazyBaseModule:
         default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
         config[self.name] = default_config | config.get(self.name, {})
         instance.setup(config)
+        instance.module_setup()
         return instance
 
     def __repr__(self):
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 910f48b..51c13c2 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -19,9 +19,7 @@ from auto_archiver.core import Storage
 
 class GDriveStorage(Storage):
 
-    def setup(self, config: dict) -> None:
-        # Step 1: Call the BaseModule setup to dynamically assign configs
-        super().setup(config)
+    def module_setup(self) -> None:
         self.scopes = ['https://www.googleapis.com/auth/drive']
         # Initialize Google Drive service
         self._setup_google_drive_service()
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index 50bf430..dd98032 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -21,8 +21,7 @@ from . import GWorksheet
 
 class GsheetsFeeder(Feeder):
 
-    def setup(self, config: dict):
-        super().setup(config)
+    def module_setup(self) -> None:
         self.gsheets_client = gspread.service_account(filename=self.service_account)
         # TODO mv to validators
         assert self.sheet or self.sheet_id, (
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index 4da82c8..bbba097 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -17,9 +17,8 @@ class HtmlFormatter(Formatter):
     environment: Environment = None
     template: any = None
 
-    def setup(self, config: dict) -> None:
+    def module_setup(self) -> None:
         """Sets up the Jinja2 environment and loads the template."""
-        super().setup(config)  # Ensure the base class logic is executed
         template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
         self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
index 5dad0ba..367cc75 100644
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -32,8 +32,7 @@ class InstagramAPIExtractor(Extractor):
         r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
     )
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         if self.api_endpoint[-1] == "/":
             self.api_endpoint = self.api_endpoint[:-1]
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index 3cf0362..e4e210f 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -25,8 +25,7 @@ class InstagramExtractor(Extractor):
     profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
     # TODO: links to stories
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.insta = instaloader.Instaloader(
             download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder,
             filename_pattern="{date_utc}_UTC_{target}__{typename}"
diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
index 5660cd2..707dcc3 100644
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -27,12 +27,11 @@ class InstagramTbotExtractor(Extractor):
     https://t.me/instagram_load_bot
     """
 
-    def setup(self, configs) -> None:
+    def module_setup(self) -> None:
         """
        1. makes a copy of session_file that is removed in cleanup
        2. checks if the session file is valid
         """
-        super().setup(configs)
         logger.info(f"SETUP {self.name} checking login...")
         self._prepare_session_file()
         self._initialize_telegram_client()
diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py
index 2f85164..c77bbc3 100644
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -13,8 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/"
 
 class S3Storage(Storage):
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 97d3e94..3762f01 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -18,14 +18,13 @@ class TelethonExtractor(Extractor):
 
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
-    def setup(self, config: dict) -> None:
+    def module_setup(self) -> None:
 
         """
         1. makes a copy of session_file that is removed in cleanup
         2. trigger login process for telegram or proceed if already saved in a session file
         3. joins channel_invites where needed
         """
-        super().setup(config)
         logger.info(f"SETUP {self.name} checking login...")
 
         # make a copy of the session that is used exclusively with this archiver instance
diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
index 6573475..0b27e22 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -15,9 +15,7 @@ class TwitterApiExtractor(Extractor):
 
     valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
-
+    def module_setup(self) -> None:
         self.api_index = 0
         self.apis = []
         if len(self.bearer_tokens):
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 2d09138..0d1fc04 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -12,8 +12,7 @@ class VkExtractor(Extractor):
     Currently only works for /wall posts
     """
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
 
     def download(self, item: Metadata) -> Metadata:
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 1586b75..7d91f43 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -18,8 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
     When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
     """
 
-    def setup(self, configs) -> None:
-        super().setup(configs)
+    def module_setup(self) -> None:
         self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
         self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
index a51ffc1..d83319e 100644
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -13,8 +13,7 @@ class WhisperEnricher(Enricher):
 
     Only works if an S3 compatible storage is used
     """
 
-    def setup(self, config: dict) -> None:
-        super().setup(config)
+    def module_setup(self) -> None:
         self.stores = self.config['steps']['storages']
         self.s3 = get_module("s3_storage", self.config)
         if not "s3_storage" in self.stores:

From e97ccf8a736fc6bd01a0efdf9a54c8cca16d5d97 Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Mon, 10 Feb 2025 18:07:47 +0000
Subject: [PATCH 09/17] Separate setup() and module_setup().
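
Modules previously overrode setup(config) and had to remember to call
super().setup(config) before reading their own options. The config binding
now lives in BaseModule.config_setup(), which the module loader calls right
before the module's own setup() hook, so subclasses override a no-argument
setup() and need no super() call. Roughly, a module author now writes
something like the sketch below (ExampleStorage and its "region" option are
made up for illustration):

    from auto_archiver.core import Storage

    class ExampleStorage(Storage):
        def setup(self) -> None:
            # config_setup() has already copied this module's config entries
            # onto the instance, so declared options (here a hypothetical
            # "region") are plain attributes by the time setup() runs.
            self.endpoint = f"https://{self.region}.example.com"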
---
 src/auto_archiver/core/base_module.py                      | 6 +++---
 src/auto_archiver/core/module.py                           | 6 +++---
 src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 2 +-
 src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py   | 2 +-
 src/auto_archiver/modules/html_formatter/html_formatter.py | 2 +-
 .../instagram_api_extractor/instagram_api_extractor.py     | 2 +-
 .../modules/instagram_extractor/instagram_extractor.py     | 2 +-
 .../instagram_tbot_extractor/instagram_tbot_extractor.py   | 2 +-
 src/auto_archiver/modules/s3_storage/s3_storage.py         | 2 +-
 .../modules/telethon_extractor/telethon_extractor.py       | 2 +-
 .../modules/twitter_api_extractor/twitter_api_extractor.py | 2 +-
 src/auto_archiver/modules/vk_extractor/vk_extractor.py     | 2 +-
 src/auto_archiver/modules/wacz_enricher/wacz_enricher.py   | 2 +-
 .../modules/whisper_enricher/whisper_enricher.py           | 2 +-
 14 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index 95575e3..ece4719 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -14,7 +14,7 @@ class BaseModule(ABC):
     Base module class. All modules should inherit from this class.
 
     The exact methods a class implements will depend on the type of module it is,
-    however all modules have a .setup(config: dict) method to run any setup code
+    however modules can have a .setup() method to run any setup code
     (e.g. logging in to a site, spinning up a browser etc.)
 
     See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
@@ -60,7 +60,7 @@ class BaseModule(ABC):
     def storages(self) -> list:
         return self.config.get('storages', [])
 
-    def setup(self, config: dict):
+    def config_setup(self, config: dict):
 
         authentication = config.get('authentication', {})
 
         # extract out concatenated sites
@@ -80,7 +80,7 @@ class BaseModule(ABC):
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)
 
-    def module_setup(self):
+    def setup(self):
         # For any additional setup required by modules, e.g. authentication
         pass
diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py
index 69f9fcc..c81e26a 100644
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -58,7 +58,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
     This has all the information about the module, but does not load the module itself or its dependencies
 
-    To load an actual module, call .setup() on a laz module
+    To load an actual module, call .setup() on a lazy module
     """
 
     if module_name in _LAZY_LOADED_MODULES:
@@ -241,8 +241,8 @@ class LazyBaseModule:
         # merge the default config with the user config
 
         default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
         config[self.name] = default_config | config.get(self.name, {})
-        instance.setup(config)
-        instance.module_setup()
+        instance.config_setup(config)
+        instance.setup()
         return instance
 
     def __repr__(self):
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 51c13c2..f38feb6 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -19,7 +19,7 @@ from auto_archiver.core import Storage
 
 class GDriveStorage(Storage):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.scopes = ['https://www.googleapis.com/auth/drive']
         # Initialize Google Drive service
         self._setup_google_drive_service()
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index dd98032..8612d02 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -21,7 +21,7 @@ from . import GWorksheet
 
 class GsheetsFeeder(Feeder):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.gsheets_client = gspread.service_account(filename=self.service_account)
         # TODO mv to validators
         assert self.sheet or self.sheet_id, (
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index bbba097..3691735 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -17,7 +17,7 @@ class HtmlFormatter(Formatter):
     environment: Environment = None
     template: any = None
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         """Sets up the Jinja2 environment and loads the template."""
         template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
         self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
index 367cc75..a75e065 100644
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -32,7 +32,7 @@ class InstagramAPIExtractor(Extractor):
         r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
     )
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         if self.api_endpoint[-1] == "/":
             self.api_endpoint = self.api_endpoint[:-1]
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index e4e210f..0af2c32 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -25,7 +25,7 @@ class InstagramExtractor(Extractor):
     profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
     # TODO: links to stories
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.insta = instaloader.Instaloader(
             download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder,
             filename_pattern="{date_utc}_UTC_{target}__{typename}"
diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
index 707dcc3..d4b7a8e 100644
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -27,7 +27,7 @@ class InstagramTbotExtractor(Extractor):
     https://t.me/instagram_load_bot
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         """
        1. makes a copy of session_file that is removed in cleanup
        2. checks if the session file is valid
diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py
index c77bbc3..6590ac9 100644
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -13,7 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/"
 
 class S3Storage(Storage):
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 3762f01..65ea8cd 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -18,7 +18,7 @@ class TelethonExtractor(Extractor):
 
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
 
         """
         1. makes a copy of session_file that is removed in cleanup
diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
index 0b27e22..72fd2f2 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -15,7 +15,7 @@ class TwitterApiExtractor(Extractor):
 
     valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.api_index = 0
         self.apis = []
         if len(self.bearer_tokens):
diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
index 0d1fc04..99527c4 100644
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -12,7 +12,7 @@ class VkExtractor(Extractor):
     Currently only works for /wall posts
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
 
     def download(self, item: Metadata) -> Metadata:
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
index 7d91f43..c324c62 100644
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -18,7 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
     When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
         self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
index d83319e..89579f9 100644
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -13,7 +13,7 @@ class WhisperEnricher(Enricher):
 
     Only works if an S3 compatible storage is used
     """
 
-    def module_setup(self) -> None:
+    def setup(self) -> None:
         self.stores = self.config['steps']['storages']
         self.s3 = get_module("s3_storage", self.config)
         if not "s3_storage" in self.stores:

From 3dae2337a1e3a97b913780b58e45adbc1d0aff5a Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Mon, 10 Feb 2025 18:56:46 +0000
Subject: [PATCH 10/17] remove cdn_url check before storage.
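
Media.is_stored() used to probe the storage via get_cdn_url(), which for
Google Drive meant looking a file up by parent folder and name before it had
ever been uploaded, so the check itself could fail; the Drive helpers papered
over that with raise_on_missing=False. The check is now a plain count (as in
the hunk below), and the Drive lookups can raise on genuinely missing entries:

    def is_stored(self, in_storage) -> bool:
        # A media item counts as stored once it has collected one URL per
        # configured storage; note this assumes each storage appends exactly
        # one URL to media.urls when it stores the file.
        return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])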
---
 src/auto_archiver/core/media.py                            | 2 +-
 src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 952a025..b6820ab 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -65,7 +65,7 @@ class Media:
 
     def is_stored(self, in_storage) -> bool:
         # checks if the media is already stored in the given storage
-        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url(self) in u])
+        return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
 
     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index f38feb6..4971030 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -68,11 +68,10 @@ class GDriveStorage(Storage):
         filename = path_parts[-1]
         logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
         for folder in path_parts[0:-1]:
-            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
             parent_id = folder_id
 
         # get id of file inside folder (or sub folder)
-        # TODO: supressing the error as being checked before first upload
-        file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False)
+        file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
         if not file_id:

From a69ac3e509eed60f1801aca605531b6bc8f3e506 Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Tue, 11 Feb 2025 09:46:22 +0000
Subject: [PATCH 11/17] Fix file hash reference in S3 tests

---
 tests/storages/test_S3_storage.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py
index 2594e73..e532a18 100644
--- a/tests/storages/test_S3_storage.py
+++ b/tests/storages/test_S3_storage.py
@@ -2,13 +2,12 @@ from typing import Type
 import pytest
 from unittest.mock import MagicMock, patch
 from auto_archiver.core import Media
-from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.modules.s3_storage import s3_storage
 
 
-class TestGDriveStorage:
+class TestS3Storage:
     """
-    Test suite for GDriveStorage.
+    Test suite for S3Storage.
""" module_name: str = "s3_storage" storage: Type[s3_storage] @@ -66,7 +65,7 @@ class TestGDriveStorage: # Set duplicate checking config to true: self.storage.random_no_duplicate = True - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calc_hash, \ + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calc_hash, \ patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' mock_file_in_folder.return_value = 'existing_key.txt' @@ -87,8 +86,7 @@ class TestGDriveStorage: # Create test media with calculated hash media = Media("test.txt") media.key = "original_path.txt" - - with patch('auto_archiver.modules.hash_enricher.HashEnricher.calculate_hash') as mock_calculate_hash: + with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calculate_hash: mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" # Verify upload assert self.storage.is_upload_needed(media) is False From 18666ff027526b99114d2b4ffb6304f9b3a83461 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:28:24 +0000 Subject: [PATCH 12/17] skip authenticated tests in test_gsheet_feeder.py --- tests/feeders/test_gsheet_feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index ecf57f1..bdf3e70 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -185,7 +185,7 @@ def test_should_process_sheet(setup_module): assert gdb.should_process_sheet("AnotherSheet") == False -# @pytest.mark.skip(reason="Requires a real connection") +@pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: """Testing GSheetsFeeder class""" From 1792e02d1d32c99ca1a59aeb0cab33a74d3a783e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:34:36 +0000 Subject: [PATCH 13/17] skip authenticated tests in test_gdrive_storage.py --- tests/storages/test_gdrive_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 4259cb2..57480d0 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -35,7 +35,7 @@ class TestGDriveStorage(TestStorageBase): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" -# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): """ From 89d9140d15eb9e4261abf27f9c71df47ef8efb07 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:47:11 +0000 Subject: [PATCH 14/17] Fixed setup/ config_setup reference --- tests/storages/test_gdrive_storage.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 57480d0..aba0a25 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -7,7 +7,7 @@ from auto_archiver.core.metadata import Metadata from tests.storages.test_storage_base import TestStorageBase -class TestGDriveStorage(TestStorageBase): +class TestGDriveStorage: """ Test suite for GDriveStorage. 
""" @@ -21,6 +21,10 @@ class TestGDriveStorage(TestStorageBase): 'service_account': 'fake_service_account.json' } + @pytest.fixture(autouse=True) + def gdrive(self, setup_module): + with patch('google.oauth2.service_account.Credentials.from_service_account_file') as mock_creds: + self.storage = setup_module(self.module_name, self.config) def test_initialize_fails_with_non_existent_creds(self): """ @@ -28,13 +32,15 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.setup(self.config) + self.storage.setup() assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): media = Media(filename="test.jpg") media.key = "folder1/folder2/test.jpg" + @pytest.mark.skip(reason="Requires real credentials") @pytest.mark.download class TestGDriveStorageConnected(TestStorageBase): From f97ec6a9e0ac20268f045b661f2e080ff1eb8574 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 11:58:28 +0000 Subject: [PATCH 15/17] Fixed S3 module import --- tests/storages/test_S3_storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index e532a18..2a5d026 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -2,7 +2,7 @@ from typing import Type import pytest from unittest.mock import MagicMock, patch from auto_archiver.core import Media -from auto_archiver.modules.s3_storage import s3_storage +from auto_archiver.modules.s3_storage import S3Storage class TestS3Storage: @@ -10,7 +10,7 @@ class TestS3Storage: Test suite for S3Storage. """ module_name: str = "s3_storage" - storage: Type[s3_storage] + storage: Type[S3Storage] s3: MagicMock config: dict = { "path_generator": "flat", @@ -78,7 +78,7 @@ class TestS3Storage: ) - @patch.object(s3_storage.S3Storage, 'file_in_folder') + @patch.object(S3Storage, 'file_in_folder') def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True @@ -97,7 +97,7 @@ class TestS3Storage: mock_upload.assert_not_called() assert result is True - @patch.object(s3_storage.S3Storage, 'is_upload_needed') + @patch.object(S3Storage, 'is_upload_needed') def test_uploads_with_correct_parameters(self, mock_upload_needed): media = Media("test.txt") media.key = "original_key.txt" From 5e2e93382ffc47893183aae83ff138055b0edeb8 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 11 Feb 2025 12:17:42 +0000 Subject: [PATCH 16/17] Test fixes for 3.10 compliance. 
---
 tests/databases/test_gsheet_db.py   | 2 +-
 tests/feeders/test_gsheet_feeder.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index 0a655a8..32e8403 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -24,7 +24,7 @@ def mock_metadata():
     metadata.status = "done"
     metadata.get_title.return_value = "Example Title"
     metadata.get.return_value = "Example Content"
-    metadata.get_timestamp.return_value = "2025-01-01T00:00:00Z"
+    metadata.get_timestamp.return_value = "2025-01-01T00:00:00"
     metadata.get_final_media.return_value = MagicMock(spec=Media)
     metadata.get_all_media.return_value = []
     metadata.get_media_by_id.return_value = None
diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py
index bdf3e70..b86e329 100644
--- a/tests/feeders/test_gsheet_feeder.py
+++ b/tests/feeders/test_gsheet_feeder.py
@@ -52,7 +52,7 @@ def gsheet_feeder(setup_module) -> GsheetsFeeder:
     return feeder
 
 
-class TestWorksheet:
+class MockWorksheet:
     """
     mimics the bits we need from gworksheet
     """
@@ -91,7 +91,7 @@ class TestWorksheet:
 
 
 def test__process_rows(gsheet_feeder: GsheetsFeeder):
-    testworksheet = TestWorksheet()
+    testworksheet = MockWorksheet()
     metadata_items = list(gsheet_feeder._process_rows(testworksheet))
     assert len(metadata_items) == 3
     assert isinstance(metadata_items[0], Metadata)
 
 
 def test__set_metadata(gsheet_feeder: GsheetsFeeder):
-    worksheet = TestWorksheet()
+    worksheet = MockWorksheet()
     metadata = Metadata()
     gsheet_feeder._set_context(metadata, worksheet, 1)
     assert metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}
@@ -112,7 +112,7 @@ def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, workshe
 
 
 def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
-    testworksheet = TestWorksheet()
+    testworksheet = MockWorksheet()
     metadata = Metadata()
     testworksheet.wks.title = "TestSheet"
     gsheet_feeder._set_context(metadata, testworksheet, 6)

From d1d6cde008861f508b8689ff6fd30cdde2fccd3a Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Tue, 11 Feb 2025 12:27:48 +0000
Subject: [PATCH 17/17] Set mock timestamp without z format

---
 tests/databases/test_gsheet_db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py
index 32e8403..18a22f1 100644
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -41,7 +41,7 @@ def metadata():
     metadata.set_title("Example Title")
     metadata.set_content("Example Content")
     metadata.success("my-archiver")
-    metadata.set("timestamp", "2025-01-01T00:00:00Z")
+    metadata.set("timestamp", "2025-01-01T00:00:00")
     metadata.set("date", "2025-02-04T18:22:24.909112+00:00")
     return metadata