kopia lustrzana https://github.com/bellingcat/auto-archiver
Google sheets feeder and database implemented.
rodzic
6c67effd8c
commit
57b3bec935
|
@ -1,6 +1,7 @@
|
|||
{
|
||||
"name": "Google Sheets Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "gsheet_db::GsheetsDb",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
|
|
|
@ -7,30 +7,36 @@
|
|||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||
"columns": {
|
||||
"default": {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'pdq_hash': 'perceptual hashes',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {
|
||||
"default": None,
|
||||
"help": "(alternative to sheet name) the id of the sheet to archive",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path",
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
|
||||
|
@ -43,7 +49,7 @@
|
|||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
}
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder
|
||||
|
@ -61,5 +67,5 @@
|
|||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/service_account.json` (the default value of the `service_account` option).
|
||||
- Create the sheet using the template provided in the docs.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
|
|
@ -21,41 +21,13 @@ from . import GWorksheet
|
|||
|
||||
class GsheetsFeeder(Feeder):
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""
|
||||
Initializes the GsheetsFeeder with preloaded configurations.
|
||||
"""
|
||||
super().__init__()
|
||||
# Initialize the gspread client with the provided service account file
|
||||
# self.gsheets_client = gspread.service_account(filename=self.config["service_account"])
|
||||
#
|
||||
# # Set up feeder-specific configurations from the config
|
||||
# self.sheet_name = config.get("sheet")
|
||||
# self.sheet_id = config.get("sheet_id")
|
||||
# self.header = config.get("header", 1)
|
||||
# self.columns = config.get("columns", {})
|
||||
# assert self.sheet_name or self.sheet_id, (
|
||||
# "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
# )
|
||||
|
||||
|
||||
# # Configuration attributes
|
||||
# self.sheet = config.get("sheet")
|
||||
# self.sheet_id = config.get("sheet_id")
|
||||
# self.header = config.get("header", 1)
|
||||
# self.columns = config.get("columns", {})
|
||||
# self.allow_worksheets = config.get("allow_worksheets", set())
|
||||
# self.block_worksheets = config.get("block_worksheets", set())
|
||||
# self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True)
|
||||
|
||||
# Ensure the header is an integer
|
||||
# try:
|
||||
# self.header = int(self.header)
|
||||
# except ValueError:
|
||||
# pass
|
||||
# assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}"
|
||||
# assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined."
|
||||
#
|
||||
def setup(self, config: dict):
|
||||
super().setup(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
assert self.sheet or self.sheet_id, (
|
||||
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
)
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
|
@ -63,7 +35,6 @@ class GsheetsFeeder(Feeder):
|
|||
else: # self.sheet_id
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
|
|
Ładowanie…
Reference in New Issue