2025-02-05 16:42:58 +00:00
|
|
|
from typing import Type
|
|
|
|
|
|
|
|
import gspread
|
|
|
|
import pytest
|
2025-02-25 21:32:32 +00:00
|
|
|
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
|
2025-02-06 10:11:56 +00:00
|
|
|
from auto_archiver.core import Metadata, Feeder
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
2025-02-18 23:32:03 +00:00
|
|
|
def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
    """Module setup must raise ValueError when neither ``sheet`` nor ``sheet_id`` is set."""
    # Replace gspread.service_account with a mock for the duration of the test.
    mocker.patch("gspread.service_account")
    config = {"service_account": "dummy.json", "sheet": None, "sheet_id": None}
    with pytest.raises(ValueError):
        setup_module("gsheet_feeder_db", config)
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
    """Provide a ``gsheet_feeder_db`` module whose ``gsheets_client`` is a MagicMock.

    ``gspread.service_account`` is patched before the module is set up, and the
    client attribute on the resulting feeder is then replaced with a mock.
    """
    # Maps the feeder's logical column keys to the header text used in the sheet.
    column_headers = {
        "url": "link",
        "status": "archive status",
        "folder": "destination folder",
        "archive": "archive location",
        "date": "archive date",
        "thumbnail": "thumbnail",
        "timestamp": "upload timestamp",
        "title": "upload title",
        "text": "text content",
        "screenshot": "screenshot",
        "hash": "hash",
        "pdq_hash": "perceptual hashes",
        "wacz": "wacz",
        "replaywebpage": "replaywebpage",
    }
    settings: dict = {
        "service_account": "dummy.json",
        "sheet": "test-auto-archiver",
        "sheet_id": None,
        "header": 1,
        "columns": column_headers,
        "allow_worksheets": set(),
        "block_worksheets": set(),
        "use_sheet_names_in_stored_paths": True,
    }

    mocker.patch("gspread.service_account")
    module = setup_module("gsheet_feeder_db", settings)
    module.gsheets_client = mocker.MagicMock()
    return module
|
|
|
|
|
|
|
|
|
2025-02-11 12:17:42 +00:00
|
|
|
class MockWorksheet:
    """
    Mimics the bits we need from gworksheet.

    Rows are plain dicts keyed by column name plus a ``"row"`` number. Each
    instance works on its own copy of the template ``rows`` so that a test
    which mutates the data cannot affect other tests.
    """

    class SheetSheet:
        # Stand-in for the wrapped sheet object; only ``title`` is provided.
        title = "TestSheet"

    # Template data; copied per instance in __init__.
    rows = [
        {"row": 2, "url": "http://example.com", "status": "", "folder": ""},
        {"row": 3, "url": "http://example.com", "status": "", "folder": ""},
        {"row": 4, "url": "", "status": "", "folder": ""},
        {"row": 5, "url": "https://another.com", "status": None, "folder": ""},
        {
            "row": 6,
            "url": "https://another.com",
            "status": "success",
            "folder": "some_folder",
        },
    ]

    def __init__(self):
        self.wks = self.SheetSheet()
        # Shadow the class-level list with a per-instance copy (the row values
        # are scalars, so copying each dict is sufficient).
        self.rows = [dict(row) for row in type(self).rows]

    def _find_row(self, row):
        """Return the dict whose ``"row"`` equals *row*, or an empty dict if none matches."""
        return next((r for r in self.rows if r["row"] == row), {})

    def count_rows(self):
        """Return the highest row number present, or 0 when there are no rows."""
        return max((r["row"] for r in self.rows), default=0)

    def get_cell(self, row, col_name, fresh=False):
        """Return the value stored for *col_name* in *row*, or ``""`` if row or column is missing.

        ``fresh`` is accepted but not used by this mock.
        """
        return self._find_row(row).get(col_name, "")

    def get_cell_or_default(self, row, col_name, default):
        """Return the value stored for *col_name* in *row*, or *default* if row or column is missing."""
        return self._find_row(row).get(col_name, default)
|
|
|
|
|
2025-02-06 16:53:00 +00:00
|
|
|
|
2025-02-25 21:32:32 +00:00
|
|
|
def test__process_rows(gsheet_feeder: GsheetsFeederDB):
    """_process_rows() over the mock worksheet must yield exactly three Metadata items."""
    produced = list(gsheet_feeder._process_rows(MockWorksheet()))
    assert len(produced) == 3

    first, *_ = produced
    assert isinstance(first, Metadata)
    assert first.get("url") == "http://example.com"
|
|
|
|
|
2025-02-06 16:53:00 +00:00
|
|
|
|
2025-02-25 21:32:32 +00:00
|
|
|
def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
    """_set_context() must store the row number and worksheet under the "gsheet" context key."""
    sheet = MockWorksheet()
    item = Metadata()

    gsheet_feeder._set_context(item, sheet, 1)

    expected_context = {"row": 1, "worksheet": sheet}
    assert item.get_context("gsheet") == expected_context
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip(reason="Not recognising folder column")
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
    """_set_context() must record the given row and worksheet under the "gsheet" context key.

    NOTE(review): relies on a ``worksheet`` fixture that is not defined in this
    module -- confirm it is provided elsewhere before removing the skip marker.
    """
    # _set_context takes the metadata object first, as in the sibling tests.
    metadata = Metadata()
    gsheet_feeder._set_context(metadata, worksheet, 7)
    # The stored row must be the row that was passed in.
    assert metadata.get_context("gsheet") == {"row": 7, "worksheet": worksheet}
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
2025-02-25 21:32:32 +00:00
|
|
|
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
    """_set_context() on row 6 (which has a folder value) must also set the "folder" context."""
    sheet = MockWorksheet()
    sheet.wks.title = "TestSheet"
    item = Metadata()

    gsheet_feeder._set_context(item, sheet, 6)

    gsheet_context = item.get_context("gsheet")
    folder_context = item.get_context("folder")
    assert gsheet_context == {"row": 6, "worksheet": sheet}
    assert folder_context == "some-folder/test-auto-archiver/testsheet"
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.usefixtures("setup_module")
@pytest.mark.parametrize(
    "sheet, sheet_id, expected_method, expected_arg, description",
    [
        ("TestSheet", None, "open", "TestSheet", "opening by sheet name"),
        (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"),
    ],
)
def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker):
    """Ensure open_sheet() correctly opens by name or ID based on configuration."""
    mock_service_account = mocker.patch("gspread.service_account")
    mock_client = mocker.MagicMock()
    mock_service_account.return_value = mock_client
    mock_client.open.return_value = "MockSheet"
    mock_client.open_by_key.return_value = "MockSheet"

    # Setup module with parameterized values
    feeder = setup_module(
        "gsheet_feeder_db",
        {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
    )
    sheet_result = feeder.open_sheet()

    # Validate the correct method was called. The mock raises its own
    # AssertionError on mismatch, so no message expression is attached here.
    getattr(mock_client, expected_method).assert_called_once_with(expected_arg)
    assert sheet_result == "MockSheet", f"Failed: {description}"
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.usefixtures("setup_module")
def test_open_sheet_with_sheet_id(setup_module, mocker):
    """With only ``sheet_id`` configured, open_sheet() must call ``open_by_key`` with that ID."""
    fake_client = mocker.MagicMock()
    fake_client.open_by_key.return_value = "MockSheet"
    mocker.patch("gspread.service_account").return_value = fake_client

    settings = {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}
    feeder = setup_module("gsheet_feeder_db", settings)

    opened = feeder.open_sheet()

    fake_client.open_by_key.assert_called_once_with("ABC123")
    assert opened == "MockSheet"
|
|
|
|
|
|
|
|
|
|
|
|
def test_should_process_sheet(setup_module, mocker):
    """should_process_sheet() must honour the configured allow and block worksheet sets."""
    mocker.patch("gspread.service_account")
    settings = {
        "service_account": "dummy.json",
        "sheet": "TestSheet",
        "sheet_id": None,
        "allow_worksheets": {"TestSheet", "Sheet2"},
        "block_worksheets": {"Sheet3"},
    }
    gdb = setup_module("gsheet_feeder_db", settings)

    expectations = [
        ("TestSheet", True),
        ("Sheet3", False),
        # False if allow_worksheets is set
        ("AnotherSheet", False),
    ]
    for worksheet_name, expected in expectations:
        assert gdb.should_process_sheet(worksheet_name) is expected
|
2025-02-05 16:42:58 +00:00
|
|
|
|
|
|
|
|
2025-02-11 11:28:24 +00:00
|
|
|
@pytest.mark.skip(reason="Requires a real connection")
class TestGSheetsFeederReal:
    """Tests for GsheetsFeederDB that talk to a real Google Sheet (skipped by default)."""

    module_name: str = "gsheet_feeder_db"
    feeder: GsheetsFeederDB
    # You must follow the setup process explained in the docs for this to work
    config: dict = {
        "service_account": "secrets/service_account.json",
        "sheet": "test-auto-archiver",
        "sheet_id": None,
        "header": 1,
        "columns": {
            "url": "link",
            "status": "archive status",
            "folder": "destination folder",
            "archive": "archive location",
            "date": "archive date",
            "thumbnail": "thumbnail",
            "timestamp": "upload timestamp",
            "title": "upload title",
            "text": "text content",
            "screenshot": "screenshot",
            "hash": "hash",
            "pdq_hash": "perceptual hashes",
            "wacz": "wacz",
            "replaywebpage": "replaywebpage",
        },
        "allow_worksheets": set(),
        "block_worksheets": set(),
        "use_sheet_names_in_stored_paths": True,
    }

    @pytest.fixture(autouse=True)
    def setup_feeder(self, setup_module):
        """Build the feeder from ``module_name`` and ``config`` before every test."""
        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
        # Holds a feeder instance (see the class-level ``feeder`` annotation).
        self.feeder = setup_module(self.module_name, self.config)

    def _open_first_worksheet(self):
        """Open the configured spreadsheet with gspread and return its worksheet at index 0."""
        client = gspread.service_account(self.config["service_account"])
        sheet = client.open(self.config["sheet"])
        return sheet.get_worksheet(0)

    def reset_test_sheet(self):
        """Clears test sheet and re-adds headers to ensure consistent test results."""
        worksheet = self._open_first_worksheet()
        worksheet.clear()
        worksheet.append_row(["Link", "Archive Status"])

    def test_setup(self):
        """The feeder built by the fixture must expose a ``gsheets_client`` attribute."""
        assert hasattr(self.feeder, "gsheets_client")

    def test_open_sheet_real_connection(self):
        """Ensure open_sheet() connects to a real Google Sheets instance."""
        sheet = self.feeder.open_sheet()
        assert sheet is not None, "open_sheet() should return a valid sheet instance"
        assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method"

    def test_iter_yields_metadata_real_data(self):
        """Ensure __iter__() yields Metadata objects for real test sheet data."""
        self.reset_test_sheet()
        worksheet = self._open_first_worksheet()
        # Insert test rows as a temp method
        # Next we will refactor the feeder for better testing
        test_rows = [
            ["https://example.com", ""],
            ["", ""],
            ["https://example.com", "done"],
        ]
        worksheet.append_rows(test_rows)
        metadata_list = list(self.feeder)

        # Validate that only the first row is processed
        assert len(metadata_list) == 1
        assert metadata_list[0].metadata.get("url") == "https://example.com"
|
|
|
|
|
|
|
|
|
|
|
|
# TODO
|
|
|
|
|
|
|
|
# Test two sheets
|
|
|
|
# test two sheets with different columns
|
|
|
|
# test folder implementation
|