Mirrored from https://github.com/bellingcat/auto-archiver

Merge GSheet Feeder and Database.

parent 011ded2bde
commit 077b56c150
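For orientation before the diff itself: the commit collapses the separate GsheetsFeeder and GsheetsDb modules into a single GsheetsFeederDB class that inherits from both core base classes. A minimal, illustrative sketch of that pattern follows; the base classes and method bodies here are placeholders, not the real auto_archiver.core implementations.

    # Illustrative sketch only: Feeder/Database stand in for the real
    # auto_archiver.core base classes; method bodies are placeholders.
    class Feeder:
        def __iter__(self):
            raise NotImplementedError

    class Database:
        def started(self, item): ...
        def done(self, item, cached=False): ...

    class GsheetsFeederDB(Feeder, Database):
        """One module now plays both roles: it feeds URLs from the sheet
        and writes archival status/metadata back to the same sheet."""

        def __iter__(self):
            # feeder role: yield one item per unprocessed sheet row
            yield from []

        def done(self, item, cached=False):
            # database role: write the result back to the originating row
            pass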
@@ -1 +0,0 @@
-from .gsheet_db import GsheetsDb
@@ -1,38 +0,0 @@
-{
-    "name": "Google Sheets Database",
-    "type": ["database"],
-    "entry_point": "gsheet_db::GsheetsDb",
-    "requires_setup": True,
-    "dependencies": {
-        "python": ["loguru", "gspread", "slugify"],
-    },
-    "configs": {
-        "allow_worksheets": {
-            "default": set(),
-            "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-        },
-        "block_worksheets": {
-            "default": set(),
-            "help": "(CSV) explicitly block some worksheets from being processed",
-        },
-        "use_sheet_names_in_stored_paths": {
-            "default": True,
-            "type": "bool",
-            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
-        }
-    },
-    "description": """
-    GsheetsDatabase:
-    Handles integration with Google Sheets for tracking archival tasks.
-
-    ### Features
-    - Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
-    - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
-    - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
-    - Skips redundant updates for empty or invalid data fields.
-
-    ### Notes
-    - Currently works only with metadata provided by GsheetFeeder.
-    - Requires configuration of a linked Google Sheet and appropriate API credentials.
-    """
-}
@@ -1,114 +0,0 @@
-from typing import Union, Tuple
-from urllib.parse import quote
-
-from loguru import logger
-
-from auto_archiver.core import Database
-from auto_archiver.core import Metadata, Media
-from auto_archiver.modules.gsheet_feeder import GWorksheet
-from auto_archiver.utils.misc import get_current_timestamp
-
-
-class GsheetsDb(Database):
-    """
-    NB: only works if GsheetFeeder is used.
-    could be updated in the future to support non-GsheetFeeder metadata
-    """
-
-    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
-        gw, row = self._retrieve_gsheet(item)
-        gw.set_cell(row, "status", "Archive in progress")
-
-    def failed(self, item: Metadata, reason: str) -> None:
-        logger.error(f"FAILED {item}")
-        self._safe_status_update(item, f"Archive failed {reason}")
-
-    def aborted(self, item: Metadata) -> None:
-        logger.warning(f"ABORTED {item}")
-        self._safe_status_update(item, "")
-
-    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
-        """check if the given item has been archived already"""
-        return False
-
-    def done(self, item: Metadata, cached: bool = False) -> None:
-        """archival result ready - should be saved to DB"""
-        logger.success(f"DONE {item.get_url()}")
-        gw, row = self._retrieve_gsheet(item)
-        # self._safe_status_update(item, 'done')
-
-        cell_updates = []
-        row_values = gw.get_row(row)
-
-        def batch_if_valid(col, val, final_value=None):
-            final_value = final_value or val
-            try:
-                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
-                    cell_updates.append((row, col, final_value))
-            except Exception as e:
-                logger.error(f"Unable to batch {col}={final_value} due to {e}")
-
-        status_message = item.status
-        if cached:
-            status_message = f"[cached] {status_message}"
-        cell_updates.append((row, "status", status_message))
-
-        media: Media = item.get_final_media()
-        if hasattr(media, "urls"):
-            batch_if_valid("archive", "\n".join(media.urls))
-        batch_if_valid("date", True, get_current_timestamp())
-        batch_if_valid("title", item.get_title())
-        batch_if_valid("text", item.get("content", ""))
-        batch_if_valid("timestamp", item.get_timestamp())
-        if media:
-            batch_if_valid("hash", media.get("hash", "not-calculated"))
-
-        # merge all pdq hashes into a single string, if present
-        pdq_hashes = []
-        all_media = item.get_all_media()
-        for m in all_media:
-            if pdq := m.get("pdq_hash"):
-                pdq_hashes.append(pdq)
-        if len(pdq_hashes):
-            batch_if_valid("pdq_hash", ",".join(pdq_hashes))
-
-        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
-            screenshot, "urls"
-        ):
-            batch_if_valid("screenshot", "\n".join(screenshot.urls))
-
-        if thumbnail := item.get_first_image("thumbnail"):
-            if hasattr(thumbnail, "urls"):
-                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
-
-        if browsertrix := item.get_media_by_id("browsertrix"):
-            batch_if_valid("wacz", "\n".join(browsertrix.urls))
-            batch_if_valid(
-                "replaywebpage",
-                "\n".join(
-                    [
-                        f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
-                        for wacz in browsertrix.urls
-                    ]
-                ),
-            )
-
-        gw.batch_set_cell(cell_updates)
-
-    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
-        try:
-            gw, row = self._retrieve_gsheet(item)
-            gw.set_cell(row, "status", new_status)
-        except Exception as e:
-            logger.debug(f"Unable to update sheet: {e}")
-
-    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-
-        if gsheet := item.get_context("gsheet"):
-            gw: GWorksheet = gsheet.get("worksheet")
-            row: int = gsheet.get("row")
-        elif self.sheet_id:
-            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
-
-        return gw, row
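For reference, the cell_updates list that done() (above, and again in the merged module later in this diff) hands to gw.batch_set_cell is simply a list of (row, column, value) tuples. A tiny illustration with made-up row numbers and values:

    # Made-up values, matching the (row, column, value) shape that done()
    # accumulates before calling gw.batch_set_cell(cell_updates).
    cell_updates = [
        (7, "status", "archived"),                                  # hypothetical status text
        (7, "archive", "https://web.archive.org/web/2025/https://example.com"),  # placeholder URL
        (7, "date", "2025-01-01T00:00:00+00:00"),
        (7, "hash", "not-calculated"),
    ]
    for row, col, value in cell_updates:
        print(f"write {value!r} into column {col!r} of row {row}")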
@@ -1,2 +0,0 @@
-from .gworksheet import GWorksheet
-from .gsheet_feeder import GsheetsFeeder
@@ -1,95 +0,0 @@
-"""
-GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
-
-This reads data from Google Sheets and filters rows based on user-defined rules.
-The filtered rows are processed into `Metadata` objects.
-
-### Key properties
-- validates the sheet's structure and filters rows based on input configurations.
-- Ensures only rows with valid URLs and unprocessed statuses are included.
-"""
-import os
-import gspread
-
-from loguru import logger
-from slugify import slugify
-
-from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata
-from . import GWorksheet
-
-
-class GsheetsFeeder(Feeder):
-
-    def setup(self) -> None:
-        self.gsheets_client = gspread.service_account(filename=self.service_account)
-        # TODO mv to validators
-        if not self.sheet and not self.sheet_id:
-            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
-
-    def open_sheet(self):
-        if self.sheet:
-            return self.gsheets_client.open(self.sheet)
-        else:  # self.sheet_id
-            return self.gsheets_client.open_by_key(self.sheet_id)
-
-    def __iter__(self) -> Metadata:
-        sh = self.open_sheet()
-        for ii, worksheet in enumerate(sh.worksheets()):
-            if not self.should_process_sheet(worksheet.title):
-                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
-                continue
-            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
-            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
-            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
-                continue
-
-            # process and yield metadata here:
-            yield from self._process_rows(gw)
-            logger.success(f'Finished worksheet {worksheet.title}')
-
-    def _process_rows(self, gw: GWorksheet):
-        for row in range(1 + self.header, gw.count_rows() + 1):
-            url = gw.get_cell(row, 'url').strip()
-            if not len(url): continue
-            original_status = gw.get_cell(row, 'status')
-            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
-            # TODO: custom status parser(?) aka should_retry_from_status
-            if status not in ['', None]: continue
-
-            # All checks done - archival process starts here
-            m = Metadata().set_url(url)
-            self._set_context(m, gw, row)
-            yield m
-
-    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
-
-        m.set_context("gsheet", {"row": row, "worksheet": gw})
-
-        if gw.get_cell_or_default(row, 'folder', "") is None:
-            folder = ''
-        else:
-            folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-        if len(folder):
-            if self.use_sheet_names_in_stored_paths:
-                m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
-            else:
-                m.set_context("folder", folder)
-
-
-    def should_process_sheet(self, sheet_name: str) -> bool:
-        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
-            # ALLOW rules exist AND sheet name not explicitly allowed
-            return False
-        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
-            # BLOCK rules exist AND sheet name is blocked
-            return False
-        return True
-
-    def missing_required_columns(self, gw: GWorksheet) -> list:
-        missing = []
-        for required_col in ['url', 'status']:
-            if not gw.col_exists(required_col):
-                missing.append(required_col)
-        return missing
@@ -0,0 +1,2 @@
+from .gworksheet import GWorksheet
+from .gsheet_feeder_db import GsheetsFeederDB
@@ -1,7 +1,7 @@
 {
-    "name": "Google Sheets Feeder",
-    "type": ["feeder"],
-    "entry_point": "gsheet_feeder::GsheetsFeeder",
+    "name": "Google Sheets Feeder Database",
+    "type": ["feeder", "database"],
+    "entry_point": "gsheet_feeder_db::GsheetsFeederDB",
     "requires_setup": True,
     "dependencies": {
         "python": ["loguru", "gspread", "slugify"],
@@ -51,10 +51,23 @@
             "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
             "type": "bool",
         },
+        "allow_worksheets": {
+            "default": set(),
+            "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+        },
+        "block_worksheets": {
+            "default": set(),
+            "help": "(CSV) explicitly block some worksheets from being processed",
+        },
+        "use_sheet_names_in_stored_paths": {
+            "default": True,
+            "type": "bool",
+            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+        }
     },
     "description": """
-    GsheetsFeeder
-    A Google Sheets-based feeder for the Auto Archiver.
+    GsheetsFeederDatabase
+    A Google Sheets-based feeder and optional database for the Auto Archiver.

     This reads data from Google Sheets and filters rows based on user-defined rules.
     The filtered rows are processed into `Metadata` objects.
@@ -64,11 +77,16 @@
     - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
     - Ensures only rows with valid URLs and unprocessed statuses are included for archival.
     - Supports organizing stored files into folder paths based on sheet and worksheet names.
+    - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
+    - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
+    - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
+    - Skips redundant updates for empty or invalid data fields.

     ### Setup
     - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
       To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
     - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
     - Customize the column names in your Google sheet using the `columns` configuration.
+    - The Google Sheet can be used solely as a feeder, or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
     """,
 }
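The allow_worksheets/block_worksheets options described above act as an allow-then-block filter; this is the same logic implemented by should_process_sheet further down in this diff. A small self-contained sketch of the behaviour, with invented worksheet names:

    def should_process_sheet(name: str, allow: set, block: set) -> bool:
        # if an allow list exists, the name must be on it
        if len(allow) and name not in allow:
            return False
        # if a block list exists, the name must not be on it
        if len(block) and name in block:
            return False
        return True

    # hypothetical worksheet names; an empty allow list means "allow everything"
    print(should_process_sheet("Sheet1", allow=set(), block=set()))          # True
    print(should_process_sheet("Sheet1", allow={"Incidents"}, block=set()))  # False
    print(should_process_sheet("Drafts", allow=set(), block={"Drafts"}))     # False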
@@ -0,0 +1,196 @@
+"""
+GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
+
+This reads data from Google Sheets and filters rows based on user-defined rules.
+The filtered rows are processed into `Metadata` objects.
+
+### Key properties
+- validates the sheet's structure and filters rows based on input configurations.
+- Ensures only rows with valid URLs and unprocessed statuses are included.
+"""
+import os
+from typing import Tuple, Union
+from urllib.parse import quote
+
+import gspread
+from loguru import logger
+from slugify import slugify
+
+from auto_archiver.core import Feeder, Database, Media
+from auto_archiver.core import Metadata
+from auto_archiver.modules.gsheet_feeder_db import GWorksheet
+from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
+
+
+class GsheetsFeederDB(Feeder, Database):
+
+    def setup(self) -> None:
+        self.gsheets_client = gspread.service_account(filename=self.service_account)
+        # TODO mv to validators
+        if not self.sheet and not self.sheet_id:
+            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
+
+    def open_sheet(self):
+        if self.sheet:
+            return self.gsheets_client.open(self.sheet)
+        else:  # self.sheet_id
+            return self.gsheets_client.open_by_key(self.sheet_id)
+
+    def __iter__(self) -> Metadata:
+        sh = self.open_sheet()
+        for ii, worksheet in enumerate(sh.worksheets()):
+            if not self.should_process_sheet(worksheet.title):
+                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
+                continue
+            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
+            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
+            if len(missing_cols := self.missing_required_columns(gw)):
+                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
+                continue
+
+            # process and yield metadata here:
+            yield from self._process_rows(gw)
+            logger.success(f'Finished worksheet {worksheet.title}')
+
+    def _process_rows(self, gw: GWorksheet):
+        for row in range(1 + self.header, gw.count_rows() + 1):
+            url = gw.get_cell(row, 'url').strip()
+            if not len(url): continue
+            original_status = gw.get_cell(row, 'status')
+            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
+            # TODO: custom status parser(?) aka should_retry_from_status
+            if status not in ['', None]: continue
+
+            # All checks done - archival process starts here
+            m = Metadata().set_url(url)
+            self._set_context(m, gw, row)
+            yield m
+
+    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
+        # TODO: Check folder value not being recognised
+        m.set_context("gsheet", {"row": row, "worksheet": gw})
+
+        if gw.get_cell_or_default(row, 'folder', "") is None:
+            folder = ''
+        else:
+            folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
+        if len(folder):
+            if self.use_sheet_names_in_stored_paths:
+                m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
+            else:
+                m.set_context("folder", folder)
+
+    def should_process_sheet(self, sheet_name: str) -> bool:
+        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
+            # ALLOW rules exist AND sheet name not explicitly allowed
+            return False
+        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
+            # BLOCK rules exist AND sheet name is blocked
+            return False
+        return True
+
+    def missing_required_columns(self, gw: GWorksheet) -> list:
+        missing = []
+        for required_col in ['url', 'status']:
+            if not gw.col_exists(required_col):
+                missing.append(required_col)
+        return missing
+
+
+    def started(self, item: Metadata) -> None:
+        logger.warning(f"STARTED {item}")
+        gw, row = self._retrieve_gsheet(item)
+        gw.set_cell(row, "status", "Archive in progress")
+
+    def failed(self, item: Metadata, reason: str) -> None:
+        logger.error(f"FAILED {item}")
+        self._safe_status_update(item, f"Archive failed {reason}")
+
+    def aborted(self, item: Metadata) -> None:
+        logger.warning(f"ABORTED {item}")
+        self._safe_status_update(item, "")
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check if the given item has been archived already"""
+        return False
+
+    def done(self, item: Metadata, cached: bool = False) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item.get_url()}")
+        gw, row = self._retrieve_gsheet(item)
+        # self._safe_status_update(item, 'done')
+
+        cell_updates = []
+        row_values = gw.get_row(row)
+
+        def batch_if_valid(col, val, final_value=None):
+            final_value = final_value or val
+            try:
+                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
+                    cell_updates.append((row, col, final_value))
+            except Exception as e:
+                logger.error(f"Unable to batch {col}={final_value} due to {e}")
+
+        status_message = item.status
+        if cached:
+            status_message = f"[cached] {status_message}"
+        cell_updates.append((row, "status", status_message))
+
+        media: Media = item.get_final_media()
+        if hasattr(media, "urls"):
+            batch_if_valid("archive", "\n".join(media.urls))
+        batch_if_valid("date", True, get_current_timestamp())
+        batch_if_valid("title", item.get_title())
+        batch_if_valid("text", item.get("content", ""))
+        batch_if_valid("timestamp", item.get_timestamp())
+        if media:
+            batch_if_valid("hash", media.get("hash", "not-calculated"))
+
+        # merge all pdq hashes into a single string, if present
+        pdq_hashes = []
+        all_media = item.get_all_media()
+        for m in all_media:
+            if pdq := m.get("pdq_hash"):
+                pdq_hashes.append(pdq)
+        if len(pdq_hashes):
+            batch_if_valid("pdq_hash", ",".join(pdq_hashes))
+
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
+            screenshot, "urls"
+        ):
+            batch_if_valid("screenshot", "\n".join(screenshot.urls))
+
+        if thumbnail := item.get_first_image("thumbnail"):
+            if hasattr(thumbnail, "urls"):
+                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
+
+        if browsertrix := item.get_media_by_id("browsertrix"):
+            batch_if_valid("wacz", "\n".join(browsertrix.urls))
+            batch_if_valid(
+                "replaywebpage",
+                "\n".join(
+                    [
+                        f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
+                        for wacz in browsertrix.urls
+                    ]
+                ),
+            )
+
+        gw.batch_set_cell(cell_updates)
+
+    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
+        try:
+            gw, row = self._retrieve_gsheet(item)
+            gw.set_cell(row, "status", new_status)
+        except Exception as e:
+            logger.debug(f"Unable to update sheet: {e}")
+
+    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
+
+        if gsheet := item.get_context("gsheet"):
+            gw: GWorksheet = gsheet.get("worksheet")
+            row: int = gsheet.get("row")
+        elif self.sheet_id:
+            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
+
+        return gw, row
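One detail of done() worth calling out: for every WACZ produced by browsertrix it also writes a ready-to-open replayweb.page link, built by URL-encoding both the WACZ location and the original URL. A standalone sketch of that formatting step, with invented example URLs:

    from urllib.parse import quote

    def replaywebpage_links(wacz_urls, original_url):
        # mirrors the list comprehension in done(): one viewer link per WACZ file
        return "\n".join(
            f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(original_url)}"
            for wacz in wacz_urls
        )

    # invented example values
    print(replaywebpage_links(
        ["https://cdn.example.org/archive/item1.wacz"],
        "https://example.com/post/123",
    ))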
@@ -2,8 +2,7 @@ from datetime import datetime, timezone
 import pytest

 from auto_archiver.core import Metadata, Media
-from auto_archiver.modules.gsheet_db import GsheetsDb
-from auto_archiver.modules.gsheet_feeder import GWorksheet
+from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet


 @pytest.fixture
@@ -32,8 +31,9 @@ def mock_metadata(mocker):
 @pytest.fixture
 def metadata():
     metadata = Metadata()
-    metadata.add_media(Media(filename="screenshot.png", urls=["http://example.com/screenshot.png"]).set("id", "screenshot"))
-    metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]).set("id", "browsertrix"))
+    metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"]))
+    metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]))
+    metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"]))
     metadata.set_url("http://example.com")
     metadata.set_title("Example Title")
     metadata.set_content("Example Content")
@@ -52,12 +52,19 @@ def mock_media(mocker):
     return mock_media


 @pytest.fixture
-def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsDb:
-    db = setup_module("gsheet_db", {
-        "allow_worksheets": "set()",
-        "block_worksheets": "set()",
-        "use_sheet_names_in_stored_paths": "True",
-    })
+def gsheets_db(mock_gworksheet, setup_module, mocker):
+    mocker.patch("gspread.service_account")
+    config: dict = {
+        "sheet": "testsheet",
+        "sheet_id": None,
+        "header": 1,
+        "service_account": "test/service_account.json",
+        "columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 'replaywebpage': 'replaywebpage'},
+        "allow_worksheets": set(),
+        "block_worksheets": set(),
+        "use_sheet_names_in_stored_paths": True,
+    }
+    db = setup_module("gsheet_feeder_db", config)
     db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
     return db

@@ -79,10 +86,10 @@ def expected_calls(mock_media, fixed_timestamp):
         (1, 'text', 'Example Content'),
         (1, 'timestamp', '2025-01-01T00:00:00+00:00'),
         (1, 'hash', 'not-calculated'),
-        (1, 'screenshot', 'http://example.com/screenshot.png'),
-        (1, 'thumbnail', '=IMAGE("http://example.com/screenshot.png")'),
-        (1, 'wacz', 'http://example.com/browsertrix.wacz'),
-        (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A//example.com/browsertrix.wacz#view=pages&url=http%3A//example.com')
+        # (1, 'screenshot', 'http://example.com/screenshot.png'),
+        # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
+        # (1, 'wacz', 'http://example.com/browsertrix.wacz'),
+        # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
     ]


 def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
@@ -107,13 +114,13 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):


 def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
     gsheets_db.done(metadata)
     mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)


 def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
     gsheets_db.done(metadata, cached=True)

     # Verify the status message includes "[cached]"
@@ -124,7 +131,7 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
 def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
     # clear media from metadata
     metadata.media = []
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
     gsheets_db.done(metadata)
     # Verify nothing media-related gets updated
     call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
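The tests above mock _retrieve_gsheet, which is the glue between the two roles: the feeder stores the worksheet handle and row number on each Metadata item via set_context("gsheet", ...), and the database half reads them back when writing results. A reduced sketch of that handoff, with a plain dict standing in for Metadata's context store and a fake object standing in for GWorksheet:

    # Simplified stand-ins: a dict plays the part of Metadata's context store
    # and a plain object plays the part of GWorksheet.
    class FakeWorksheet:
        def set_cell(self, row, col, value):
            print(f"row {row}: {col} = {value!r}")

    item_context = {}

    # feeder side (see _set_context in the diff): remember where the row came from
    item_context["gsheet"] = {"row": 7, "worksheet": FakeWorksheet()}

    # database side (see _retrieve_gsheet in the diff): read it back and update status
    gsheet = item_context.get("gsheet")
    gw, row = gsheet["worksheet"], gsheet["row"]
    gw.set_cell(row, "status", "Archive in progress")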
@@ -2,7 +2,7 @@ from typing import Type

 import gspread
 import pytest
-from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
+from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
 from auto_archiver.core import Metadata, Feeder


@@ -11,13 +11,13 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
     mocker.patch("gspread.service_account")
     with pytest.raises(ValueError):
         setup_module(
-            "gsheet_feeder",
+            "gsheet_feeder_db",
             {"service_account": "dummy.json", "sheet": None, "sheet_id": None},
         )


 @pytest.fixture
-def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
+def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
     config: dict = {
         "service_account": "dummy.json",
         "sheet": "test-auto-archiver",
@@ -45,7 +45,7 @@ def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
     }
     mocker.patch("gspread.service_account")
     feeder = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
         config
     )
     feeder.gsheets_client = mocker.MagicMock()
@@ -90,7 +90,7 @@ class MockWorksheet:
         return matching.get(col_name, default)


-def test__process_rows(gsheet_feeder: GsheetsFeeder):
+def test__process_rows(gsheet_feeder: GsheetsFeederDB):
     testworksheet = MockWorksheet()
     metadata_items = list(gsheet_feeder._process_rows(testworksheet))
     assert len(metadata_items) == 3
@@ -98,7 +98,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder):
     assert metadata_items[0].get("url") == "http://example.com"


-def test__set_metadata(gsheet_feeder: GsheetsFeeder):
+def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
     worksheet = MockWorksheet()
     metadata = Metadata()
     gsheet_feeder._set_context(metadata, worksheet, 1)
@@ -106,12 +106,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder):


 @pytest.mark.skip(reason="Not recognising folder column")
-def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet):
+def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
     gsheet_feeder._set_context(worksheet, 7)
     assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}


-def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
+def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
     testworksheet = MockWorksheet()
     metadata = Metadata()
     testworksheet.wks.title = "TestSheet"
@@ -140,7 +140,7 @@ def test_open_sheet_with_name_or_id(

     # Setup module with parameterized values
     feeder = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
         {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
     )
     sheet_result = feeder.open_sheet()
@@ -159,7 +159,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
     mock_service_account.return_value = mock_client
     mock_client.open_by_key.return_value = "MockSheet"
     feeder = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
         {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
     )
     sheet = feeder.open_sheet()
@@ -170,7 +170,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
 def test_should_process_sheet(setup_module, mocker):
     mocker.patch("gspread.service_account")
     gdb = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
         {
             "service_account": "dummy.json",
             "sheet": "TestSheet",
@@ -187,10 +187,10 @@ def test_should_process_sheet(setup_module, mocker):

 @pytest.mark.skip(reason="Requires a real connection")
 class TestGSheetsFeederReal:
-    """Testing GSheetsFeeder class"""
+    """Testing GsheetsFeeder class"""

-    module_name: str = "gsheet_feeder"
-    feeder: GsheetsFeeder
+    module_name: str = "gsheet_feeder_db"
+    feeder: GsheetsFeederDB
     # You must follow the setup process explain in the docs for this to work
     config: dict = {
         "service_account": "secrets/service_account.json",