kopia lustrzana https://github.com/bellingcat/auto-archiver
refactoring storages
rodzic
10f03cb888
commit
d33daabee1
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
|
||||
class Storage(ABC):
|
||||
TMP_FOLDER = "tmp/"
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, config): pass
|
||||
|
||||
|
@ -28,18 +29,18 @@ class Storage(ABC):
|
|||
and others not, but that all can call
|
||||
"""
|
||||
for k, v in kwargs.items():
|
||||
if k in self.get_allowed_properties():
|
||||
if k in self._get_allowed_properties():
|
||||
setattr(self, k, v)
|
||||
else:
|
||||
logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
|
||||
|
||||
def _get_allowed_properties(self):
    """
    Names of the attributes that callers are allowed to set dynamically on this storage
    child classes should override this to declare the properties they accept
    """
    allowed = {"subfolder"}
    return allowed
|
||||
|
||||
def _clean_path(self, folder, default="", add_forward_slash=True):
    """
    Normalises @folder into a path string
    Returns @default when @folder is None, not a string, or empty/whitespace-only
    Otherwise returns @folder as normalised by pathlib.Path, followed by "/" if @add_forward_slash
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses;
    # None and any non-string value fall through to the default
    if not isinstance(folder, str) or not folder.strip():
        return default
    suffix = "/" if add_forward_slash else ""
    return str(Path(folder)) + suffix
|
||||
|
|
|
@ -12,15 +12,14 @@ import time
|
|||
@dataclass
class GDConfig:
    # id of the Google Drive folder used as the parent when looking up / creating subfolders
    root_folder_id: str
    # folder name passed as the fallback when the storage's subfolder is empty or unset
    default_upload_folder_name: str = "default"
|
||||
|
||||
|
||||
class GDStorage(Storage):
|
||||
DEFAULT_UPLOAD_FOLDER_NAME = "default"
|
||||
|
||||
def __init__(self, config: GDConfig):
|
||||
self.default_upload_folder_name = config.default_upload_folder_name
|
||||
self.root_folder_id = config.root_folder_id
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
|
||||
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=['https://www.googleapis.com/auth/drive'])
|
||||
self.service = build('drive', 'v3', credentials=creds)
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
|
@ -28,150 +27,54 @@ class GDStorage(Storage):
|
|||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
|
||||
self.subfolder = self._clean_path(self.subfolder, self.default_upload_folder_name, False)
|
||||
filename = key
|
||||
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
|
||||
|
||||
# retry policy on Google Drive
|
||||
try_again = True
|
||||
counter = 1
|
||||
folder_id = None
|
||||
while try_again:
|
||||
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
|
||||
results = self.service.files().list(
|
||||
q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
folder_id = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, 5, 10)
|
||||
|
||||
for item in items:
|
||||
logger.debug(f"found folder of {item['name']}")
|
||||
folder_id = item['id']
|
||||
try_again = False
|
||||
|
||||
if folder_id is None:
|
||||
logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
|
||||
counter += 1
|
||||
time.sleep(10)
|
||||
if counter > 18:
|
||||
raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
|
||||
|
||||
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
|
||||
# happens doing thumbnails
|
||||
# check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails
|
||||
# a='youtube_dl_abcde', b='index.html'
|
||||
a, _, b = filename.partition('/')
|
||||
|
||||
if b != '':
|
||||
# a: 'youtube_dl_sDE-qZdi8p8'
|
||||
# b: 'index.html'
|
||||
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
|
||||
|
||||
# get id of the sub folder
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
|
||||
filename = None
|
||||
for item in items:
|
||||
folder_id = item['id']
|
||||
filename = b
|
||||
if filename is None:
|
||||
raise ValueError(f'Problem finding sub folder {a}')
|
||||
logger.debug(f'get_cdn_url: Found a subfolder so need to split on: {a=} and {b=}')
|
||||
folder_id = self._get_id_from_parent_and_name(folder_id, a, use_mime_type=True)
|
||||
filename = b
|
||||
|
||||
# get id of file inside folder (or sub folder)
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id}' in parents and name = '{filename}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
|
||||
file_id = None
|
||||
for item in items:
|
||||
logger.debug(f"found file of {item['name']}")
|
||||
file_id = item['id']
|
||||
|
||||
if file_id is None:
|
||||
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
|
||||
|
||||
foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
|
||||
return foo
|
||||
file_id = self._get_id_from_parent_and_name(folder_id, filename)
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def exists(self, _key):
|
||||
# TODO: How to check for google drive, as it accepts different names
|
||||
# TODO: How to check for google drive, as it accepts different names?
|
||||
return False
|
||||
|
||||
def uploadf(self, file, key, **_kwargs):
|
||||
logger.debug(f"before {self.subfolder=}")
|
||||
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
|
||||
"""
|
||||
1. check if subfolder exists or create it
|
||||
2. check if key contains sub-subfolder, check if exists or create it
|
||||
3. upload file to root_id/subfolder[/sub-subfolder]/filename
|
||||
"""
|
||||
self.subfolder = self._clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
|
||||
filename = key
|
||||
logger.debug(f"after {self.subfolder=}")
|
||||
# does folder eg SM0005 exist already inside parent of Files auto-archiver
|
||||
results = self.service.files().list(
|
||||
q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
|
||||
spaces='drive',
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
folder_id_to_upload_to = None
|
||||
if len(items) > 1:
|
||||
logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
|
||||
|
||||
for item in items:
|
||||
logger.debug(f"Found existing folder of {item['name']}")
|
||||
folder_id_to_upload_to = item['id']
|
||||
|
||||
# get id of subfolder or create if it does not exist
|
||||
folder_id_to_upload_to = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, use_mime_type=True, raise_on_missing=False)
|
||||
if folder_id_to_upload_to is None:
|
||||
logger.debug(f'Creating new folder {self.subfolder}')
|
||||
file_metadata = {
|
||||
'name': [self.subfolder],
|
||||
'mimeType': 'application/vnd.google-apps.folder',
|
||||
'parents': [self.root_folder_id]
|
||||
}
|
||||
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
|
||||
folder_id_to_upload_to = gd_file.get('id')
|
||||
folder_id_to_upload_to = self._mkdir(self.subfolder, self.root_folder_id)
|
||||
|
||||
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
|
||||
# will always return a and a blank b even if there is nothing to split
|
||||
# https://stackoverflow.com/a/38149500/26086
|
||||
# check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails
|
||||
# a='youtube_dl_abcde', b='index.html'
|
||||
a, _, b = filename.partition('/')
|
||||
|
||||
if b != '':
|
||||
# a: 'youtube_dl_sDE-qZdi8p8'
|
||||
# b: 'out1.jpg'
|
||||
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
|
||||
|
||||
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
|
||||
results = self.service.files().list(
|
||||
q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
|
||||
spaces='drive', # ie not appDataFolder or photos
|
||||
fields='files(id, name)'
|
||||
).execute()
|
||||
items = results.get('files', [])
|
||||
sub_folder_id_to_upload_to = None
|
||||
if len(items) > 1:
|
||||
logger.error(f'Duplicate folder name of {a} which should never happen')
|
||||
|
||||
for item in items:
|
||||
logger.debug(f"Found existing folder of {item['name']}")
|
||||
sub_folder_id_to_upload_to = item['id']
|
||||
|
||||
logger.debug(f'uploadf: Found a subfolder so need to split on: {a=} and {b=}')
|
||||
# get id of subfolder or create if it does not exist
|
||||
sub_folder_id_to_upload_to = self._get_id_from_parent_and_name(folder_id_to_upload_to, a, use_mime_type=True, raise_on_missing=False)
|
||||
if sub_folder_id_to_upload_to is None:
|
||||
# create new folder
|
||||
file_metadata = {
|
||||
'name': [a],
|
||||
'mimeType': 'application/vnd.google-apps.folder',
|
||||
'parents': [folder_id_to_upload_to]
|
||||
}
|
||||
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
|
||||
sub_folder_id_to_upload_to = gd_file.get('id')
|
||||
sub_folder_id_to_upload_to = self._mkdir(a, folder_id_to_upload_to)
|
||||
|
||||
filename = b
|
||||
folder_id_to_upload_to = sub_folder_id_to_upload_to
|
||||
# back to normal control flow
|
||||
|
||||
# upload file to gd
|
||||
file_metadata = {
|
||||
|
@ -180,8 +83,55 @@ class GDStorage(Storage):
|
|||
}
|
||||
media = MediaFileUpload(file, resumable=True)
|
||||
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
|
||||
logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={folder_id_to_upload_to}')
|
||||
|
||||
def upload(self, filename: str, key: str, **kwargs):
    """
    Uploads the file at path @filename under @key
    GD only needs the filename (not an open file reader), so the path is handed straight to uploadf
    """
    storage_name = self.__class__.__name__
    logger.debug(f'[{storage_name}] uploading file {filename} with key {key}')
    self.uploadf(filename, key, **kwargs)
|
||||
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True):
    """
    Retrieves the id of a folder or file from its @name and the @parent_id folder
    Optionally does multiple @retries and sleeps @sleep_seconds between them
    If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
    If @raise_on_missing will raise ValueError when not found, otherwise returns None
    Returns the id of the file or folder from its name as a string
    (when several entries match, the id of the last one listed is returned)
    """
    debug_header: str = f"[searching {name=} in {parent_id=}]"

    def _escape(value) -> str:
        # Drive query literals are single-quoted: backslashes and single quotes
        # inside them must be backslash-escaped or the query is malformed
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query_string = f"'{_escape(parent_id)}' in parents and name = '{_escape(name)}' "
    if use_mime_type:
        query_string += " and mimeType='application/vnd.google-apps.folder' "

    for attempt in range(retries):
        results = self.service.files().list(
            q=query_string,
            spaces='drive',  # ie not appDataFolder or photos
            fields='files(id, name)'
        ).execute()
        items = results.get('files', [])

        if items:
            logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
            return items[-1]['id']

        if attempt < retries - 1:
            logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}. sleeping for {sleep_seconds} second(s)')
            time.sleep(sleep_seconds)
        else:
            # last attempt: nothing to wait for, so do not claim to be sleeping
            logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')

    if raise_on_missing:
        raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
    return None
|
||||
|
||||
def _mkdir(self, name: str, parent_id: str):
    """
    Creates a new GDrive folder @name inside folder @parent_id
    Returns id of the created folder
    """
    logger.debug(f'[_mkdir] Creating new folder with {name=} inside {parent_id=}')
    file_metadata = {
        # the Drive v3 File resource documents `name` as a plain string; only `parents` is a list
        'name': name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_id],
    }
    gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
    return gd_folder.get('id')
|
||||
|
|
|
@ -4,12 +4,10 @@ from .base_storage import Storage
|
|||
|
||||
class LocalStorage(Storage):
|
||||
def __init__(self, folder):
|
||||
self.folder = folder
|
||||
if len(self.folder) and self.folder[-1] != '/':
|
||||
self.folder += '/'
|
||||
self.folder = self._clean_path(folder)
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
return self.folder + key
|
||||
return self.folder + self._clean_path(self.subfolder) + key
|
||||
|
||||
def exists(self, key):
|
||||
return os.path.isfile(self.get_cdn_url(key))
|
||||
|
|
|
@ -19,8 +19,8 @@ class S3Config:
|
|||
endpoint_url: str = "https://{region}.digitaloceanspaces.com"
|
||||
cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
private: bool = False
|
||||
key_path: str = "default"
|
||||
no_folder: bool = False # when true folders are not used for url path
|
||||
key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid
|
||||
no_folder: bool = False # when true folders are not used for url path
|
||||
|
||||
|
||||
class S3Storage(Storage):
|
||||
|
@ -28,7 +28,7 @@ class S3Storage(Storage):
|
|||
def __init__(self, config: S3Config):
|
||||
self.bucket = config.bucket
|
||||
self.region = config.region
|
||||
self.folder = self.clean_path(config.folder)
|
||||
self.folder = self._clean_path(config.folder)
|
||||
self.private = config.private
|
||||
self.cdn_url = config.cdn_url
|
||||
self.key_path = config.key_path
|
||||
|
@ -54,8 +54,7 @@ class S3Storage(Storage):
|
|||
ext = os.path.splitext(key)[1]
|
||||
self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
|
||||
final_key = self.key_dict[key]
|
||||
return self.folder + final_key
|
||||
return self.folder + self.clean_path(self.subfolder) + key
|
||||
return self.folder + self._clean_path(self.subfolder) + final_key
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
|
||||
|
|
Ładowanie…
Reference in New Issue