Mirror of https://github.com/bellingcat/auto-archiver
s3 storaging + WIP gsheets DB
parent bb512b36c9
commit 1cdc006b27
@@ -90,11 +90,13 @@ class ConfigV2:
         self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
         self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
         self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
+        self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]

         print("feeder", self.feeder)
         print("enrichers", [e for e in self.enrichers])
         print("archivers", [e for e in self.archivers])
         print("databases", [e for e in self.databases])
+        print("storages", [e for e in self.storages])

     def validate(self):
         pass
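For context, the new "storages" entry is read from the same steps mapping as the other step types. Below is a minimal sketch of such a steps dict; the step names are illustrative, not taken from this commit:

    steps = {
        "archivers": [],
        "databases": ["gsheet_db"],
        "storages": ["s3_storage"],
    }
    # ConfigV2 then builds one StorageV2 instance per configured name, as in the added line above:
    # storages = [StorageV2.init(e, config) for e in steps.get("storages", [])]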
@@ -1,5 +1,5 @@
 from typing import Union, Tuple
-import gspread
+import gspread, datetime

 # from metadata import Metadata
 from loguru import logger
@@ -7,6 +7,7 @@ from loguru import logger
 # from . import Enricher
 from databases import Database
 from metadata import Metadata
+from media import Media
 from steps.gsheet import Gsheets
 from utils import GWorksheet

@@ -48,8 +49,37 @@ class GsheetsDb(Database):
         """archival result ready - should be saved to DB"""
         logger.success(f"DONE {item}")
         gw, row = self._retrieve_gsheet(item)
-        self._safe_status_update(item, 'done')
-        pass
+        # self._safe_status_update(item, 'done')
+
+        cell_updates = []
+        row_values = gw.get_row(row)
+
+        def batch_if_valid(col, val, final_value=None):
+            final_value = final_value or val
+            if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+                cell_updates.append((row, col, final_value))
+
+        cell_updates.append((row, 'status', item.status))
+
+        media: Media = item.get_single_media()
+
+        batch_if_valid('archive', media.cdn_url)
+        batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
+        batch_if_valid('title', item.get_title())
+        batch_if_valid('text', item.get("content", "")[:500])
+        batch_if_valid('timestamp', item.get_timestamp())
+
+        # TODO: AFTER ENRICHMENTS
+        # batch_if_valid('hash', media.hash)
+        # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+        # batch_if_valid('thumbnail_index', result.thumbnail_index)
+        # batch_if_valid('duration', result.duration, str(result.duration))
+        # batch_if_valid('screenshot', result.screenshot)
+        # if result.wacz is not None:
+        #     batch_if_valid('wacz', result.wacz)
+        #     batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
+
+        gw.batch_set_cell(cell_updates)

     def _safe_status_update(self, item: Metadata, new_status: str) -> None:
         try:
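A minimal, self-contained sketch of the batch_if_valid pattern introduced above, using a plain dict in place of GWorksheet (row and column values are made up): a cell is only queued when the column exists and is still empty, and everything is then written in one batch call.

    row = 5
    row_values = {"status": "", "archive": "", "title": "already set"}
    cell_updates = []

    def batch_if_valid(col, val, final_value=None):
        final_value = final_value or val
        if val and col in row_values and row_values[col] == '':
            cell_updates.append((row, col, final_value))

    batch_if_valid('archive', 'https://cdn.example.com/abc.webm')
    batch_if_valid('title', 'some title')   # skipped: the cell is already filled
    print(cell_updates)                     # [(5, 'archive', 'https://cdn.example.com/abc.webm')]
    # gw.batch_set_cell(cell_updates) would then push these updates in a single API call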
@@ -1,4 +1,4 @@
-import gspread
+import gspread, os

 # from metadata import Metadata
 from loguru import logger
@@ -8,7 +8,7 @@ from feeders import Feeder
 from metadata import Metadata
 from steps.gsheet import Gsheets
 from utils import GWorksheet
+from slugify import slugify

 class GsheetsFeeder(Gsheets, Feeder):
     name = "gsheet_feeder"
@@ -60,7 +60,7 @@ class GsheetsFeeder(Gsheets, Feeder):
                 if status not in ['', None]: continue

                 # All checks done - archival process starts here
-                yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)

             logger.success(f'Finished worksheet {wks.title}')
@@ -10,8 +10,7 @@ import json
 @dataclass
 class Media:
     filename: str
-    id: str = None
-    hash: str = None
+    key: str = None
     cdn_url: str = None
-    hash: str = None
+    # id: str = None
+    # hash: str = None # TODO: added by enrichers

@@ -3,7 +3,7 @@ from __future__ import annotations
 from ast import List, Set
 from typing import Any, Union, Dict
 from dataclasses import dataclass, field
-from datetime import datetime
+import datetime
 # import json

 from media import Media
@@ -70,26 +70,40 @@ class Metadata:
     def set_title(self, title: str) -> Metadata:
         return self.set("title", title)

-    def set_timestamp(self, timestamp: datetime) -> Metadata:
-        assert type(timestamp) == datetime, "set_timestamp expects a datetime instance"
+    def get_title(self) -> str:
+        return self.get("title")
+
+    def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
+        assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
         return self.set("timestamp", timestamp)

+    def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
+        ts = self.get("timestamp")
+        if not ts: return ts
+        if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
+        if iso: return ts.isoformat()
+        return ts
+
     def add_media(self, media: Media) -> Metadata:
         # print(f"adding {filename} to {self.metadata.get('media')}")
         # return self.set("media", self.get_media() + [filename])
         # return self.get_media().append(media)
         return self.media.append(media)

+    def get_single_media(self) -> Media:
+        # TODO: check if formatters were applied and choose with priority
+        return self.media[0]
+
     # def as_json(self) -> str:
     #     # converts all metadata and data into JSON
     #     return json.dumps(self.metadata)
     #     #TODO: datetime is not serializable

     def cleanup(self) -> Metadata:
-        #TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
+        # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
         # the code below leads to errors if database needs tmp_keys after they are removed
         # """removes temporary metadata fields, ideally called after all ops except writing"""
         # for tmp_key in self.tmp_keys:
         #     self.metadata.pop(tmp_key, None)
         # self.tmp_keys = set()
         pass
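The new get_timestamp helper normalizes stored datetimes before they reach the spreadsheet. A small sketch of the same utc/iso conversion using only the standard library (the values are illustrative):

    import datetime

    ts = datetime.datetime(2022, 11, 23, 12, 0, 0)   # naive datetime, as set_timestamp stores it
    ts = ts.replace(tzinfo=datetime.timezone.utc)    # utc=True tags it as UTC
    print(ts.isoformat())                            # iso=True -> '2022-11-23T12:00:00+00:00'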
@@ -2,15 +2,18 @@ from __future__ import annotations
 from ast import List
 from typing import Union, Dict
 from dataclasses import dataclass
-from archivers.archiver import Archiverv2

-from enrichers.enricher import Enricher
-from databases.database import Database
+from archivers import Archiverv2
+from storages import StorageV2
+from enrichers import Enricher
+from databases import Database
 from metadata import Metadata

 import tempfile, time, traceback
 from loguru import logger



 """
 how not to couple the different pieces of logic
 due to the use of constants for the metadata keys?
@@ -133,11 +136,11 @@ class ArchivingOrchestrator:
         self.enrichers = config.enrichers
         self.archivers: List[Archiverv2] = config.archivers
         self.databases: List[Database] = config.databases
+        self.storages: List[StorageV2] = config.storages

         for a in self.archivers: a.setup()

         self.formatters = []
-        self.storages = []
         # self.formatters = [
         #     Formatter.init(f, config)
         #     for f in config.formatters
@@ -184,7 +187,7 @@ class ArchivingOrchestrator:

     def archive(self, result: Metadata) -> Union[Metadata, None]:
         url = result.get_url()
         # TODO: clean urls
         for a in self.archivers:
             url = a.clean_url(url)
         result.set_url(url)
@@ -240,8 +243,8 @@ class ArchivingOrchestrator:

         # storage
         for s in self.storages:
-            for m in result.media:
-                result.merge(s.store(m))
+            for i, m in enumerate(result.media):
+                result.media[i] = s.store(m, result)

         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         # a hash registration service could be one database: forensic archiving
@@ -30,6 +30,7 @@ class Gsheets(Step):
         'thumbnail_index': 'thumbnail index',
         'timestamp': 'upload timestamp',
         'title': 'upload title',
+        'text': 'text content',
         'duration': 'duration',
         'screenshot': 'screenshot',
         'hash': 'hash',
@@ -1,8 +1,10 @@
-from typing import IO
+from typing import IO, Any
 import boto3, uuid, os, mimetypes
 from botocore.errorfactory import ClientError
-from src.storages import StorageV2
+from metadata import Metadata
+from media import Media
+from storages import StorageV2
 from loguru import logger
 from slugify import slugify

@@ -14,10 +16,10 @@ class S3StorageV2(StorageV2):
         super().__init__(config)
         self.s3 = boto3.client(
             's3',
-            region_name=config.region,
-            endpoint_url=config.endpoint_url.format(region=config.region),
-            aws_access_key_id=config.key,
-            aws_secret_access_key=config.secret
+            region_name=self.region,
+            endpoint_url=self.endpoint_url.format(region=self.region),
+            aws_access_key_id=self.key,
+            aws_secret_access_key=self.secret
         )

     @staticmethod
@@ -37,31 +39,31 @@ class S3StorageV2(StorageV2):
             "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
         },
         "private": {"default": False, "help": "if true S3 files will not be readable online"},
-        "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+        # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
     }

-    def get_cdn_url(self, key: str) -> str:
-        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
+    def get_cdn_url(self, media: Media) -> str:
+        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)

-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> None:
+    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
         extra_args = kwargs.get("extra_args", {})
         if not self.private and 'ACL' not in extra_args:
             extra_args['ACL'] = 'public-read'

         if 'ContentType' not in extra_args:
             try:
-                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+                extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
             except Exception as e:
-                logger.error(f"Unable to get mimetype for {key=}, error: {e}")
+                logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")

-        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)

     def exists(self, key: str) -> bool:
         """
         Tests if a given file with key=key exists in the bucket
         """
         try:
-            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            self.s3.head_object(Bucket=self.bucket, Key=key)
             return True
         except ClientError as e:
             logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
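get_cdn_url now builds the public URL from the Media key instead of a storage-side path. A quick sketch of the template substitution (the template string and values below are made up; only the {bucket}, {region} and {key} placeholders come from the config help text above):

    cdn_url = "https://{bucket}.{region}.cdn.example.com/{key}"
    print(cdn_url.format(bucket="my-bucket", region="fra1", key="some-folder/0b1c2d.webm"))
    # https://my-bucket.fra1.cdn.example.com/some-folder/0b1c2d.webm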
@@ -1,8 +1,12 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
+from typing import IO, Any
+from media import Media
 from metadata import Metadata
 from steps.step import Step
+from loguru import logger
+import os, uuid


 @dataclass
@@ -17,5 +21,23 @@ class StorageV2(Step):
     def init(name: str, config: dict) -> StorageV2:
         return Step.init(name, config, StorageV2)

+    def store(self, media: Media, item: Metadata) -> Media:
+        media = self.set_key(media, item)
+        self.upload(media)
+        media.cdn_url = self.get_cdn_url(media)
+        return media
+
     @abstractmethod
-    def store(self, item: Metadata) -> Metadata: pass
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+
+    def upload(self, media: Media, **kwargs) -> Any:
+        logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+        with open(media.filename, 'rb') as f:
+            return self.uploadf(f, media, **kwargs)
+
+    def set_key(self, media: Media, item: Metadata) -> Media:
+        """takes the media and optionally item info and generates a key"""
+        folder = item.get("folder", "")
+        ext = os.path.splitext(media.filename)[1]
+        media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+        return media
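To illustrate the store() -> set_key() -> upload() -> get_cdn_url() flow added above without the Step/S3 machinery, here is a self-contained stand-in storage that writes to a local folder; the class name, root path and file:// URL are hypothetical, but the key scheme mirrors set_key():

    import os, shutil, uuid
    from dataclasses import dataclass

    @dataclass
    class Media:
        filename: str
        key: str = None
        cdn_url: str = None

    class LocalDiskStorage:
        root = "/tmp/auto-archiver-demo"

        def store(self, media: Media, folder: str = "") -> Media:
            ext = os.path.splitext(media.filename)[1]
            media.key = os.path.join(folder, f"{uuid.uuid4()}{ext}")  # same <folder>/<uuid><ext> scheme as set_key()
            dest = os.path.join(self.root, media.key)
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            shutil.copy(media.filename, dest)                         # stands in for upload()/uploadf()
            media.cdn_url = f"file://{dest}"                          # stands in for get_cdn_url()
            return media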