kopia lustrzana https://github.com/bellingcat/auto-archiver
				
				
				
			
		
			
				
	
	
		
			234 wiersze
		
	
	
		
			9.1 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			234 wiersze
		
	
	
		
			9.1 KiB
		
	
	
	
		
			Python
		
	
	
| # import boto3
 | |
| # from botocore.errorfactory import ClientError
 | |
| from loguru import logger
 | |
| from .base_storage import Storage
 | |
| from dataclasses import dataclass
 | |
| 
 | |
| from googleapiclient.discovery import build
 | |
| from googleapiclient.errors import HttpError
 | |
| from googleapiclient.http import MediaFileUpload
 | |
| from google.oauth2 import service_account
 | |
| 
 | |
| import time
 | |
| 
 | |
| 
 | |
@dataclass
class GDConfig:
    """Settings bundle handed to ``GDStorage``.

    ``GDStorage.__init__`` reads ``root_folder_id``, ``bucket``, ``region``,
    ``folder`` and ``private``; ``key`` and ``secret`` are carried on the
    config but are not read by the storage class in this file.
    """

    # Google Drive id of the folder under which all archive folders live.
    root_folder_id: str
    # Copied onto the storage instance as-is.
    # NOTE(review): looks like a leftover from an S3-style backend - confirm.
    bucket: str
    # Copied onto the storage instance as-is (same caveat as ``bucket``).
    region: str
    # Credential fields; unused by GDStorage, which loads a service-account file.
    key: str
    secret: str
    # Prefix that ``GDStorage._get_path`` prepends to keys; empty by default.
    folder: str = ""
    # Stored on the storage instance; not consulted by any Drive call here.
    private: bool = False
 | |
| 
 | |
| 
 | |
class GDStorage(Storage):
    """Storage backend that keeps archived files in Google Drive.

    Keys have the form ``<folder>/<filename>`` or
    ``<folder>/<subfolder>/<filename>``, e.g.
    ``SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg`` or
    ``SM0002/youtube_dl_sDE-qZdi8p8/index.html``. ``<folder>`` is a direct
    child of the configured root folder. Only one level of sub folder is
    handled; anything after the second ``/`` is treated as part of the
    file name.
    """

    # OAuth scope requested for the service account.
    _SCOPES = ['https://www.googleapis.com/auth/drive']
    # Service-account key file, resolved relative to the working directory.
    _SERVICE_ACCOUNT_FILE = 'service_account.json'
    # MIME type Drive uses to mark a file entry as a folder.
    _FOLDER_MIME = 'application/vnd.google-apps.folder'
    # get_cdn_url polls for the top-level folder this many times ...
    _LOOKUP_RETRIES = 18
    # ... pausing this many seconds between attempts.
    _LOOKUP_DELAY_SECONDS = 10

    def __init__(self, config: GDConfig):
        """Copy settings from *config* and build the Drive v3 client.

        Credentials are loaded from ``service_account.json``; ``config.key``
        and ``config.secret`` are not used.
        """
        self.root_folder_id = config.root_folder_id

        self.bucket = config.bucket
        self.region = config.region
        self.folder = config.folder
        self.private = config.private

        creds = service_account.Credentials.from_service_account_file(
            self._SERVICE_ACCOUNT_FILE, scopes=self._SCOPES)
        self.service = build('drive', 'v3', credentials=creds)

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _escape(value):
        """Escape backslashes and single quotes for a Drive query literal."""
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    @staticmethod
    def _split_key(key):
        """Split *key* at its first ``/`` into ``(folder, rest)``.

        Raises:
            ValueError: if *key* has no ``/``; files must live in a folder.
        """
        folder, sep, rest = key.partition('/')
        if not sep:
            raise ValueError(f'Key {key} has no folder component; expected <folder>/<filename>')
        return folder, rest

    def _find_children(self, parent_id, name, folders_only=False):
        """Return the Drive entries called *name* directly under *parent_id*.

        Each entry is a dict with ``id`` and ``name``. With *folders_only*
        the search is restricted to folder entries.
        """
        query = f"'{self._escape(parent_id)}' in parents and name = '{self._escape(name)}'"
        if folders_only:
            query += f" and mimeType = '{self._FOLDER_MIME}'"
        results = self.service.files().list(
            q=query,
            spaces='drive',  # ie not appDataFolder or photos
            fields='files(id, name)',
        ).execute()
        return results.get('files', [])

    def _get_or_create_folder(self, parent_id, name):
        """Return the id of folder *name* under *parent_id*, creating it if absent."""
        items = self._find_children(parent_id, name, folders_only=True)
        if len(items) > 1:
            logger.error(f'Duplicate folder name of {name} which should never happen')
        if items:
            # With duplicates the last match wins, as in the original loop.
            logger.debug(f"Found existing folder of {items[-1]['name']}")
            return items[-1]['id']

        logger.debug(f'Creating new folder {name}')
        file_metadata = {
            'name': name,  # the Drive File resource expects a plain string
            'mimeType': self._FOLDER_MIME,
            'parents': [parent_id],
        }
        gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
        return gd_file.get('id')

    def _wait_for_folder(self, directory):
        """Poll the root folder until *directory* shows up and return its id.

        Raises:
            ValueError: if the folder is still missing after all retries.
        """
        for attempt in range(1, self._LOOKUP_RETRIES + 1):
            items = self._find_children(self.root_folder_id, directory, folders_only=True)
            if items:
                logger.debug(f"found folder of {items[-1]['name']}")
                return items[-1]['id']
            logger.warning(f'Cant find folder {directory} waiting and trying again count {attempt}')
            # No point sleeping after the final attempt; we are about to give up.
            if attempt < self._LOOKUP_RETRIES:
                time.sleep(self._LOOKUP_DELAY_SECONDS)
        raise ValueError(
            f'Cant find folder {directory} and retried {self._LOOKUP_RETRIES} times '
            f'pausing {self._LOOKUP_DELAY_SECONDS}seconds at a time')

    # ------------------------------------------------------------------
    # Storage interface
    # ------------------------------------------------------------------

    def _get_path(self, key):
        """Return *key* prefixed with the configured ``folder``."""
        return self.folder + key

    def get_cdn_url(self, key):
        """Return the Drive sharing URL of the file stored under *key*.

        The top-level folder is expected to exist already; because it may
        not be visible yet, the lookup is retried before giving up.

        Raises:
            ValueError: if *key* has no folder component, or the folder,
                sub folder or file cannot be found.
        """
        # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
        directory, filename = self._split_key(key)
        logger.debug(f'Looking for {directory} and filename: {filename} on GD')

        folder_id = self._wait_for_folder(directory)

        # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html
        # happens doing thumbnails
        subfolder, _, leaf = filename.partition('/')
        if leaf != '':
            logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {subfolder} and {leaf}')
            items = self._find_children(folder_id, subfolder, folders_only=True)
            if not items:
                raise ValueError(f'Problem finding sub folder {subfolder}')
            folder_id = items[-1]['id']
            filename = leaf

        # get id of file inside folder (or sub folder)
        items = self._find_children(folder_id, filename)
        if not items:
            raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
        logger.debug(f"found file of {items[-1]['name']}")

        return "https://drive.google.com/file/d/" + items[-1]['id'] + "/view?usp=sharing"

    def exists(self, key):
        """Always return ``False``; existence checks are not implemented for Drive."""
        return False

    def uploadf(self, file, key, **kwargs):
        """Upload *file* to Drive under *key*, creating folders as needed.

        Args:
            file: handed straight to ``MediaFileUpload``.
                NOTE(review): that class expects a filesystem path - confirm
                callers pass a path rather than an open file object.
            key: ``<folder>/<filename>`` or ``<folder>/<subfolder>/<filename>``.
            **kwargs: accepted for interface compatibility; ignored.

        Raises:
            ValueError: if *key* has no folder component.
        """
        # eg SM0005 and twitter__media_asdf.jpg
        foldername, filename = self._split_key(key)

        # does folder eg SM0005 exist already inside the root folder
        folder_id_to_upload_to = self._get_or_create_folder(self.root_folder_id, foldername)

        # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg
        # happens doing thumbnails; partition yields an empty tail when
        # there is nothing to split on
        subfolder, _, leaf = filename.partition('/')
        if leaf != '':
            logger.debug(f'uploadf: Found a subfolder so need to split on a: {subfolder} and {leaf}')
            folder_id_to_upload_to = self._get_or_create_folder(folder_id_to_upload_to, subfolder)
            filename = leaf

        # upload file to gd
        file_metadata = {
            'name': filename,  # plain string, not a list
            'parents': [folder_id_to_upload_to],
        }
        media = MediaFileUpload(file, resumable=True)
        self.service.files().create(body=file_metadata,
                                    media_body=media,
                                    fields='id').execute()
 |