Google Sheets feeder and database implemented.

pull/224/head
erinhmclark 2025-01-27 20:13:12 +00:00
rodzic 6c67effd8c
commit 57b3bec935
3 zmienionych plików z 39 dodań i 61 usunięć

Wyświetl plik

@ -1,6 +1,7 @@
{ {
"name": "Google Sheets Database", "name": "Google Sheets Database",
"type": ["database"], "type": ["database"],
"entry_point": "gsheet_db::GsheetsDb",
"requires_setup": True, "requires_setup": True,
"external_dependencies": { "external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"], "python": ["loguru", "gspread", "python-slugify"],

Wyświetl plik

@ -7,30 +7,36 @@
"python": ["loguru", "gspread", "python-slugify"], "python": ["loguru", "gspread", "python-slugify"],
}, },
"configs": { "configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"}, "sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, "sheet_id": {
"header": {"default": 1, "help": "index of the header row (starts at 1)"}, "default": None,
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, "help": "(alternative to sheet name) the id of the sheet to archive",
"columns": { },
"default": { "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
'url': 'link', "service_account": {
'status': 'archive status', "default": "secrets/service_account.json",
'folder': 'destination folder', "help": "service account JSON file path",
'archive': 'archive location', },
'date': 'archive date', "columns": {
'thumbnail': 'thumbnail', "default": {
'timestamp': 'upload timestamp', "url": "link",
'title': 'upload title', "status": "archive status",
'text': 'text content', "folder": "destination folder",
'screenshot': 'screenshot', "archive": "archive location",
'hash': 'hash', "date": "archive date",
'pdq_hash': 'perceptual hashes', "thumbnail": "thumbnail",
'wacz': 'wacz', "timestamp": "upload timestamp",
'replaywebpage': 'replaywebpage', "title": "upload title",
}, "text": "text content",
"help": "names of columns in the google sheet (stringified JSON object)", "screenshot": "screenshot",
"type": "auto_archiver.utils.json_loader", "hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
}, },
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
},
"allow_worksheets": { "allow_worksheets": {
"default": set(), "default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed", "help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
@ -43,7 +49,7 @@
"default": True, "default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
"type": "bool", "type": "bool",
} },
}, },
"description": """ "description": """
GsheetsFeeder GsheetsFeeder
@ -61,5 +67,5 @@
### Notes ### Notes
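- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json` (the `service_account` setting defaults to `secrets/service_account.json`). - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json` (the `service_account` setting defaults to `secrets/service_account.json`).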
- Create the sheet using the template provided in the docs. - Create the sheet using the template provided in the docs.
""" """,
} }

Wyświetl plik

@ -21,41 +21,13 @@ from . import GWorksheet
class GsheetsFeeder(Feeder): class GsheetsFeeder(Feeder):
def __init__(self) -> None: def setup(self, config: dict):
""" super().setup(config)
Initializes the GsheetsFeeder with preloaded configurations. self.gsheets_client = gspread.service_account(filename=self.service_account)
""" # TODO mv to validators
super().__init__() assert self.sheet or self.sheet_id, (
# Initialize the gspread client with the provided service account file "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
# self.gsheets_client = gspread.service_account(filename=self.config["service_account"]) )
#
# # Set up feeder-specific configurations from the config
# self.sheet_name = config.get("sheet")
# self.sheet_id = config.get("sheet_id")
# self.header = config.get("header", 1)
# self.columns = config.get("columns", {})
# assert self.sheet_name or self.sheet_id, (
# "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
# )
# # Configuration attributes
# self.sheet = config.get("sheet")
# self.sheet_id = config.get("sheet_id")
# self.header = config.get("header", 1)
# self.columns = config.get("columns", {})
# self.allow_worksheets = config.get("allow_worksheets", set())
# self.block_worksheets = config.get("block_worksheets", set())
# self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True)
# Ensure the header is an integer
# try:
# self.header = int(self.header)
# except ValueError:
# pass
# assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}"
# assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined."
#
def open_sheet(self): def open_sheet(self):
if self.sheet: if self.sheet:
@ -63,7 +35,6 @@ class GsheetsFeeder(Feeder):
else: # self.sheet_id else: # self.sheet_id
return self.gsheets_client.open_by_key(self.sheet_id) return self.gsheets_client.open_by_key(self.sheet_id)
def __iter__(self) -> Metadata: def __iter__(self) -> Metadata:
sh = self.open_sheet() sh = self.open_sheet()
for ii, wks in enumerate(sh.worksheets()): for ii, wks in enumerate(sh.worksheets()):