diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py deleted file mode 100644 index 01fdee6..0000000 --- a/src/auto_archiver/modules/gsheet_db/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py deleted file mode 100644 index cf95245..0000000 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": "Google Sheets Database", - "type": ["database"], - "entry_point": "gsheet_db::GsheetsDb", - "requires_setup": True, - "dependencies": { - "python": ["loguru", "gspread", "slugify"], - }, - "configs": { - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", - }, - "use_sheet_names_in_stored_paths": { - "default": True, - "type": "bool", - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - } - }, - "description": """ - GsheetsDatabase: - Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. -- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. - """ -} diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py deleted file mode 100644 index c19f2ae..0000000 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import Union, Tuple -from urllib.parse import quote - -from loguru import logger - -from auto_archiver.core import Database -from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_feeder import GWorksheet -from auto_archiver.utils.misc import get_current_timestamp - - -class GsheetsDb(Database): - """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata - """ - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", "Archive in progress") - - def failed(self, item: Metadata, reason: str) -> None: - logger.error(f"FAILED {item}") - self._safe_status_update(item, f"Archive failed {reason}") - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - self._safe_status_update(item, "") - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check if the given item has been archived already""" - return False - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") - gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') - - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": - cell_updates.append((row, col, final_value)) - except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") - - status_message = item.status - if cached: - status_message = f"[cached] {status_message}" - cell_updates.append((row, "status", status_message)) - - media: Media = item.get_final_media() - if hasattr(media, "urls"): - batch_if_valid("archive", "\n".join(media.urls)) - batch_if_valid("date", True, get_current_timestamp()) - batch_if_valid("title", item.get_title()) - batch_if_valid("text", item.get("content", "")) - batch_if_valid("timestamp", item.get_timestamp()) - if media: - batch_if_valid("hash", media.get("hash", "not-calculated")) - - # merge all pdq hashes into a single string, if present - pdq_hashes = [] - all_media = item.get_all_media() - for m in all_media: - if pdq := m.get("pdq_hash"): - pdq_hashes.append(pdq) - if len(pdq_hashes): - batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - - if (screenshot := item.get_media_by_id("screenshot")) and hasattr( - screenshot, "urls" - ): - batch_if_valid("screenshot", "\n".join(screenshot.urls)) - - if thumbnail := item.get_first_image("thumbnail"): - if hasattr(thumbnail, "urls"): - batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - - if browsertrix := item.get_media_by_id("browsertrix"): - batch_if_valid("wacz", "\n".join(browsertrix.urls)) - batch_if_valid( - "replaywebpage", - "\n".join( - [ - f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" - for wacz in browsertrix.urls - ] - ), - ) - - gw.batch_set_cell(cell_updates) - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: - try: - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", new_status) - except Exception as e: - logger.debug(f"Unable to update sheet: {e}") - - def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - - if gsheet := item.get_context("gsheet"): - gw: GWorksheet = gsheet.get("worksheet") - row: int = gsheet.get("row") - elif self.sheet_id: - logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") - - return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py deleted file mode 100644 index bb4230a..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ 
-from .gworksheet import GWorksheet -from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py deleted file mode 100644 index 2026804..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. - -This reads data from Google Sheets and filters rows based on user-defined rules. -The filtered rows are processed into `Metadata` objects. - -### Key properties -- validates the sheet's structure and filters rows based on input configurations. -- Ensures only rows with valid URLs and unprocessed statuses are included. -""" -import os -import gspread - -from loguru import logger -from slugify import slugify - -from auto_archiver.core import Feeder -from auto_archiver.core import Metadata -from . import GWorksheet - - -class GsheetsFeeder(Feeder): - - def setup(self) -> None: - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO mv to validators - if not self.sheet and not self.sheet_id: - raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) - - def __iter__(self) -> Metadata: - sh = self.open_sheet() - for ii, worksheet in enumerate(sh.worksheets()): - if not self.should_process_sheet(worksheet.title): - logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") - continue - logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') - gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) - if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") - continue - - # process and yield metadata here: - yield from self._process_rows(gw) - logger.success(f'Finished worksheet {worksheet.title}') - - def _process_rows(self, gw: GWorksheet): - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) - # TODO: custom status parser(?) 
aka should_retry_from_status - if status not in ['', None]: continue - - # All checks done - archival process starts here - m = Metadata().set_url(url) - self._set_context(m, gw, row) - yield m - - def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: - - m.set_context("gsheet", {"row": row, "worksheet": gw}) - - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder): - if self.use_sheet_names_in_stored_paths: - m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) - else: - m.set_context("folder", folder) - - - def should_process_sheet(self, sheet_name: str) -> bool: - if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: - # ALLOW rules exist AND sheet name not explicitly allowed - return False - if len(self.block_worksheets) and sheet_name in self.block_worksheets: - # BLOCK rules exist AND sheet name is blocked - return False - return True - - def missing_required_columns(self, gw: GWorksheet) -> list: - missing = [] - for required_col in ['url', 'status']: - if not gw.col_exists(required_col): - missing.append(required_col) - return missing diff --git a/src/auto_archiver/modules/gsheet_feeder_db/__init__.py b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py new file mode 100644 index 0000000..2e9ac02 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py @@ -0,0 +1,2 @@ +from .gworksheet import GWorksheet +from .gsheet_feeder_db import GsheetsFeederDB \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py similarity index 69% rename from src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename to src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py index 130b9f6..bb2f447 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py @@ -1,7 +1,7 @@ { - "name": "Google Sheets Feeder", - "type": ["feeder"], - "entry_point": "gsheet_feeder::GsheetsFeeder", + "name": "Google Sheets Feeder Database", + "type": ["feeder", "database"], + "entry_point": "gsheet_feeder_db::GsheetsFeederDB", "requires_setup": True, "dependencies": { "python": ["loguru", "gspread", "slugify"], @@ -51,10 +51,23 @@ "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "type": "bool", }, + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "type": "bool", + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } }, "description": """ - GsheetsFeeder - A Google Sheets-based feeder for the Auto Archiver. + GsheetsFeederDatabase + A Google Sheets-based feeder and optional database for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. The filtered rows are processed into `Metadata` objects. @@ -64,11 +77,16 @@ - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations. - Ensures only rows with valid URLs and unprocessed statuses are included for archival. 
- Supports organizing stored files into folder paths based on sheet and worksheet names. + - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. + - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. + - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. + - Skips redundant updates for empty or invalid data fields. ### Setup - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`. To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html). - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive. - Customize the column names in your Google sheet using the `columns` configuration. + - The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder. """, } diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py new file mode 100644 index 0000000..406eeb4 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -0,0 +1,196 @@ +""" +GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + +This reads data from Google Sheets and filters rows based on user-defined rules. +The filtered rows are processed into `Metadata` objects. + +### Key properties +- validates the sheet's structure and filters rows based on input configurations. +- Ensures only rows with valid URLs and unprocessed statuses are included. +""" +import os +from typing import Tuple, Union +from urllib.parse import quote + +import gspread +from loguru import logger +from slugify import slugify + +from auto_archiver.core import Feeder, Database, Media +from auto_archiver.core import Metadata +from auto_archiver.modules.gsheet_feeder_db import GWorksheet +from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp + + +class GsheetsFeederDB(Feeder, Database): + + def setup(self) -> None: + self.gsheets_client = gspread.service_account(filename=self.service_account) + # TODO mv to validators + if not self.sheet and not self.sheet_id: + raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") + + def open_sheet(self): + if self.sheet: + return self.gsheets_client.open(self.sheet) + else: # self.sheet_id + return self.gsheets_client.open_by_key(self.sheet_id) + + def __iter__(self) -> Metadata: + sh = self.open_sheet() + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") + continue + logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) + if len(missing_cols := self.missing_required_columns(gw)): + logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") + continue + + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') + + def _process_rows(self, gw: GWorksheet): + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not
len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) aka should_retry_from_status + if status not in ['', None]: continue + + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m + + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) + + def should_process_sheet(self, sheet_name: str) -> bool: + if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(self.block_worksheets) and sheet_name in self.block_worksheets: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def missing_required_columns(self, gw: GWorksheet) -> list: + missing = [] + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + missing.append(required_col) + return missing + + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", "Archive in progress") + + def failed(self, item: Metadata, reason: str) -> None: + logger.error(f"FAILED {item}") + self._safe_status_update(item, f"Archive failed {reason}") + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + self._safe_status_update(item, "") + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + return False + + def done(self, item: Metadata, cached: bool = False) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item.get_url()}") + gw, row = self._retrieve_gsheet(item) + # self._safe_status_update(item, 'done') + + cell_updates = [] + row_values = gw.get_row(row) + + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + try: + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": + cell_updates.append((row, col, final_value)) + except Exception as e: + logger.error(f"Unable to batch {col}={final_value} due to {e}") + + status_message = item.status + if cached: + status_message = f"[cached] {status_message}" + cell_updates.append((row, "status", status_message)) + + media: Media = item.get_final_media() + if hasattr(media, "urls"): + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) + + # merge all pdq hashes into a single string, if present + pdq_hashes = [] + all_media = item.get_all_media() + for m in all_media: + if pdq := m.get("pdq_hash"): + pdq_hashes.append(pdq) + if len(pdq_hashes): + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) + + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, 
"urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) + + if thumbnail := item.get_first_image("thumbnail"): + if hasattr(thumbnail, "urls"): + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') + + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) + + gw.batch_set_cell(cell_updates) + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: + try: + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", new_status) + except Exception as e: + logger.debug(f"Unable to update sheet: {e}") + + def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + + if gsheet := item.get_context("gsheet"): + gw: GWorksheet = gsheet.get("worksheet") + row: int = gsheet.get("row") + elif self.sheet_id: + logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") + + return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py similarity index 100% rename from src/auto_archiver/modules/gsheet_feeder/gworksheet.py rename to src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 8b49e5a..2f1202d 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -2,8 +2,7 @@ from datetime import datetime, timezone import pytest from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_db import GsheetsDb -from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet @pytest.fixture @@ -32,8 +31,9 @@ def mock_metadata(mocker): @pytest.fixture def metadata(): metadata = Metadata() - metadata.add_media(Media(filename="screenshot.png", urls=["http://example.com/screenshot.png"]).set("id", "screenshot")) - metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]).set("id", "browsertrix")) + metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) + metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) metadata.set_url("http://example.com") metadata.set_title("Example Title") metadata.set_content("Example Content") @@ -52,12 +52,19 @@ def mock_media(mocker): return mock_media @pytest.fixture -def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsDb: - db = setup_module("gsheet_db", { - "allow_worksheets": "set()", - "block_worksheets": "set()", - "use_sheet_names_in_stored_paths": "True", - }) +def gsheets_db(mock_gworksheet, setup_module, mocker): + mocker.patch("gspread.service_account") + config: dict = { + "sheet": "testsheet", + "sheet_id": None, + "header": 1, + "service_account": "test/service_account.json", + "columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 
'replaywebpage': 'replaywebpage'}, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + db = setup_module("gsheet_feeder_db", config) db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1)) return db @@ -79,10 +86,10 @@ def expected_calls(mock_media, fixed_timestamp): (1, 'text', 'Example Content'), (1, 'timestamp', '2025-01-01T00:00:00+00:00'), (1, 'hash', 'not-calculated'), - (1, 'screenshot', 'http://example.com/screenshot.png'), - (1, 'thumbnail', '=IMAGE("http://example.com/screenshot.png")'), - (1, 'wacz', 'http://example.com/browsertrix.wacz'), - (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A//example.com/browsertrix.wacz#view=pages&url=http%3A//example.com') + # (1, 'screenshot', 'http://example.com/screenshot.png'), + # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), + # (1, 'wacz', 'http://example.com/browsertrix.wacz'), + # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') ] def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): @@ -107,13 +114,13 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" @@ -124,7 +131,7 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker): # clear media from metadata metadata.media = [] - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index ef150d1..9ca81b0 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -2,7 +2,7 @@ from typing import Type import gspread import pytest -from auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB from auto_archiver.core import Metadata, Feeder @@ -11,13 +11,13 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker): mocker.patch("gspread.service_account") with pytest.raises(ValueError): setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, ) @pytest.fixture -def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: +def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB: config: dict 
= { "service_account": "dummy.json", "sheet": "test-auto-archiver", @@ -45,7 +45,7 @@ def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: } mocker.patch("gspread.service_account") feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", config ) feeder.gsheets_client = mocker.MagicMock() @@ -90,7 +90,7 @@ class MockWorksheet: return matching.get(col_name, default) -def test__process_rows(gsheet_feeder: GsheetsFeeder): +def test__process_rows(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) assert len(metadata_items) == 3 @@ -98,7 +98,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert metadata_items[0].get("url") == "http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder): +def test__set_metadata(gsheet_feeder: GsheetsFeederDB): worksheet = MockWorksheet() metadata = Metadata() gsheet_feeder._set_context(metadata, worksheet, 1) @@ -106,12 +106,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder): @pytest.mark.skip(reason="Not recognising folder column") -def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet): gsheet_feeder._set_context(worksheet, 7) assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} -def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata = Metadata() testworksheet.wks.title = "TestSheet" @@ -140,7 +140,7 @@ def test_open_sheet_with_name_or_id( # Setup module with parameterized values feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, ) sheet_result = feeder.open_sheet() @@ -159,7 +159,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, ) sheet = feeder.open_sheet() @@ -170,7 +170,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): def test_should_process_sheet(setup_module, mocker): mocker.patch("gspread.service_account") gdb = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", { "service_account": "dummy.json", "sheet": "TestSheet", @@ -187,10 +187,10 @@ def test_should_process_sheet(setup_module, mocker): @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: - """Testing GSheetsFeeder class""" + """Testing GsheetsFeeder class""" - module_name: str = "gsheet_feeder" - feeder: GsheetsFeeder + module_name: str = "gsheet_feeder_db" + feeder: GsheetsFeederDB # You must follow the setup process explain in the docs for this to work config: dict = { "service_account": "secrets/service_account.json",
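
The merged module above only works end-to-end because the feeder half records where each URL came from and the database half reads that record back: `_set_context` stores `{"row": ..., "worksheet": ...}` under the `"gsheet"` context key, and `_retrieve_gsheet` later uses it to find the cell to update. A minimal sketch of that handoff, using stand-in classes instead of the real `GWorksheet`/`Metadata` (which require gspread credentials):

```python
# Sketch only: FakeWorksheet and SimpleMetadata are stand-ins for the real
# GWorksheet and Metadata classes from the diff above.

class FakeWorksheet:
    def set_cell(self, row: int, col: str, value: str) -> None:
        print(f"row {row}, column {col!r} <- {value!r}")

class SimpleMetadata:
    def __init__(self, url: str):
        self.url = url
        self._context = {}

    def set_context(self, key, value):
        self._context[key] = value

    def get_context(self, key):
        return self._context.get(key)

# Feeder side (_set_context): remember which sheet row produced this item.
item = SimpleMetadata("http://example.com")
item.set_context("gsheet", {"row": 7, "worksheet": FakeWorksheet()})

# Database side (_retrieve_gsheet / started): read it back and update status.
gsheet = item.get_context("gsheet")
gw, row = gsheet["worksheet"], gsheet["row"]
gw.set_cell(row, "status", "Archive in progress")
# Without this feeder-set context there is no row to write to, which is why
# the sheet database cannot currently be fed from an alternate feeder.
```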
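
The `allow_worksheets`/`block_worksheets` options added to the manifest gate which tabs `__iter__` visits, via `should_process_sheet`. A standalone version of that check for illustration (the logic mirrors the method in the diff; the tab names are made up):

```python
def should_process_sheet(sheet_name: str, allow: set, block: set) -> bool:
    """Standalone mirror of GsheetsFeederDB.should_process_sheet."""
    if allow and sheet_name not in allow:
        return False  # an allow list exists and this tab is not on it
    if block and sheet_name in block:
        return False  # a block list exists and this tab is on it
    return True

# Hypothetical tab names, only to show the behaviour of the two options:
assert should_process_sheet("Tab A", allow=set(), block=set())            # no rules: processed
assert should_process_sheet("Tab A", allow={"Tab A"}, block=set())        # explicitly allowed
assert not should_process_sheet("Tab B", allow={"Tab A"}, block=set())    # allow list set, tab missing
assert not should_process_sheet("Drafts", allow=set(), block={"Drafts"})  # explicitly blocked
```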