Google Sheets feeder and database implemented.

pull/224/head
erinhmclark 2025-01-27 20:13:12 +00:00
rodzic 6c67effd8c
commit 57b3bec935
3 zmienionych plików z 39 dodań i 61 usunięć

Wyświetl plik

@ -1,6 +1,7 @@
{
"name": "Google Sheets Database",
"type": ["database"],
"entry_point": "gsheet_db::GsheetsDb",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],

Wyświetl plik

@ -7,30 +7,36 @@
"python": ["loguru", "gspread", "python-slugify"],
},
"configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
"columns": {
"default": {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'timestamp': 'upload timestamp',
'title': 'upload title',
'text': 'text content',
'screenshot': 'screenshot',
'hash': 'hash',
'pdq_hash': 'perceptual hashes',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {
"default": None,
"help": "(alternative to sheet name) the id of the sheet to archive",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path",
},
"columns": {
"default": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
@ -43,7 +49,7 @@
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
"type": "bool",
}
},
},
"description": """
GsheetsFeeder
@ -61,5 +67,5 @@
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
"""
""",
}

Wyświetl plik

@ -21,41 +21,13 @@ from . import GWorksheet
class GsheetsFeeder(Feeder):
def __init__(self) -> None:
    """
    Initialize the feeder by delegating to the base-class initializer.

    No feeder-specific state is set here: the gspread client is created
    and the sheet / sheet_id check is performed in ``setup``.
    """
    super().__init__()
def setup(self, config: dict):
    """
    Apply ``config`` through the base class and create the gspread client.

    Args:
        config: Configuration dict, forwarded unchanged to ``super().setup``.

    Raises:
        AssertionError: If neither ``self.sheet`` nor ``self.sheet_id`` is set.
    """
    # NOTE(review): self.service_account, self.sheet and self.sheet_id are
    # presumably populated by super().setup(config) -- confirm in the base class.
    super().setup(config)
    self.gsheets_client = gspread.service_account(filename=self.service_account)
    # TODO mv to validators
    # Raised explicitly instead of via an `assert` statement so the check
    # still runs under `python -O`; exception type and message are unchanged.
    if not (self.sheet or self.sheet_id):
        raise AssertionError(
            "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
        )
def open_sheet(self):
if self.sheet:
@ -63,7 +35,6 @@ class GsheetsFeeder(Feeder):
else: # self.sheet_id
return self.gsheets_client.open_by_key(self.sheet_id)
def __iter__(self) -> Metadata:
sh = self.open_sheet()
for ii, wks in enumerate(sh.worksheets()):