Mirror of https://github.com/bellingcat/auto-archiver
parent 6bca0d979c
commit 04e9808049
@@ -9,4 +9,7 @@ __pycache__/
 anu.html
 *.log
 .pytest_cach
-anon*
+
+anon*
+
+files/
@@ -10,7 +10,10 @@
 "program": "auto_archive.py",
 "console": "integratedTerminal",
 "justMyCode": true,
-"args": ["--sheet","Test Hashing","--use-filenumber-as-directory=True"]
+// "args": ["--sheet","Test Hashing"]
+// "args": ["--sheet","Test Hashing","--use-filenumber-as-directory"]
+
+"args": ["--sheet","Test Hashing","--use-filenumber-as-directory", "--storage=gd"]
 },
 {
 "name": "Python: auto_archive CIR --sheet",
@@ -68,25 +68,25 @@ class Archiver(ABC):
         page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
         page_filename = 'tmp/' + page_key

-        # DM feature flag
-        # page_cdn gets written to the spreadsheet
-        if filenumber is None:
-            page_cdn = self.storage.get_cdn_url(page_key)
-        else:
-            page_cdn = self.storage.get_cdn_url(filenumber + "/" + page_key)
-
         with open(page_filename, "w") as f:
             f.write(page)

         page_hash = self.get_hash(page_filename)

         # DM feature flag
-        if filenumber != "":
+        if filenumber != None:
             logger.debug(f'filenumber for directory is {filenumber}')
             page_key = filenumber + "/" + page_key

         self.storage.upload(page_filename, page_key, extra_args={
             'ACL': 'public-read', 'ContentType': 'text/html'})

+        # DM feature flag
+        # page_cdn gets written to the spreadsheet
+        if filenumber is None:
+            page_cdn = self.storage.get_cdn_url(page_key)
+        else:
+            page_cdn = self.storage.get_cdn_url(filenumber + "/" + page_key)
         return (page_cdn, page_hash, thumbnail)

     # def generate_media_page(self, urls, url, object):
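The reordering in this hunk matters: page_key only receives its filenumber prefix after the old page_cdn computation had already run, so the old code had to special-case the prefix when building the URL. Computing the CDN URL once the key is in its final form removes that duplication. A minimal sketch of the pitfall, using a hypothetical stub in place of the real storage client:

class StubStorage:
    def get_cdn_url(self, key):
        return f"https://cdn.example/{key}"

storage = StubStorage()
page_key, filenumber = "page.html", "SM0005"

too_early = storage.get_cdn_url(page_key)    # https://cdn.example/page.html -- misses the folder
page_key = filenumber + "/" + page_key       # key is only now in its final form
correct = storage.get_cdn_url(page_key)      # https://cdn.example/SM0005/page.html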
@@ -158,7 +158,7 @@ class Archiver(ABC):
         return hash.hexdigest()

     # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
-    def get_screenshot(self, url, filenumber):
+    def get_screenshot(self, url, filenumber, storage="GD"):
         key = self.get_key(urlparse(url).path.replace(
             "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
         filename = 'tmp/' + key
@@ -191,7 +191,8 @@ class Archiver(ABC):

         self.storage.upload(filename, key, extra_args={
             'ACL': 'public-read', 'ContentType': 'image/png'})
-        return self.storage.get_cdn_url(key)
+        foo = self.storage.get_cdn_url(key)
+        return foo

     def get_thumbnails(self, filename, key, duration=None, filenumber=None):
         thumbnails_folder = filename.split('.')[0] + '/'
@@ -219,16 +220,18 @@ class Archiver(ABC):
         for fname in thumbnails:
             if fname[-3:] == 'jpg':
                 thumbnail_filename = thumbnails_folder + fname
                 # 'SM0022/youtube_dl_sDE-qZdi8p8/out1.jpg'
                 key = key_folder + fname

                 # DM feature flag
                 # if filenumber is not None:
                 #     key = filenumber + "/" + key

-                cdn_url = self.storage.get_cdn_url(key)
-
                 self.storage.upload(thumbnail_filename, key)

+                # 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/SM0022/youtube_dl_sDE-qZdi8p8/out1.jpg'
+                cdn_url = self.storage.get_cdn_url(key)
+
                 cdn_urls.append(cdn_url)

         if len(cdn_urls) == 0:
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
         if url[-8:] != "?embed=1":
             url += "?embed=1"

-        screenshot = self.get_screenshot(url)
+        screenshot = self.get_screenshot(url, filenumber=filenumber)

         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
         urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
         images += urls

-        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
+        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)), filenumber=filenumber)
         time_elements = s.find_all('time')
         timestamp = time_elements[0].get('datetime') if len(time_elements) else None
@@ -52,7 +52,7 @@ class TelegramArchiver(Archiver):
         video_id = video_url.split('/')[-1].split('?')[0]
         key = self.get_key(video_id)

-        # DM feature flag (not tested as telethon gets all requests)
+        # DM feature flag
         if filenumber is not None:
             key = filenumber + "/" + key
@@ -11,7 +11,7 @@ import traceback
 class TwitterArchiver(Archiver):
     name = "twitter"

-    # DM added filenumber params todo fix ""
+    # DM added filenumber params and storage
     def download(self, url, check_if_exists=False, filenumber=None):
         if filenumber is not None:
             logger.debug(f'filenumber is {filenumber}')
@@ -101,10 +101,12 @@ class YoutubeDLArchiver(Archiver):
         if filenumber is not None:
             key = filenumber + "/" + key

-        cdn_url = self.storage.get_cdn_url(key)
-
         self.storage.upload(filename, key)

+        # filename = 'tmp/sDE-qZdi8p8.webm'
+        # key = 'SM0022/youtube_dl_sDE-qZdi8p8.webm'
+        cdn_url = self.storage.get_cdn_url(key)

         hash = self.get_hash(filename)
         screenshot = self.get_screenshot(url, filenumber)
@@ -1,6 +1,7 @@
 import os
 import datetime
 import argparse
 import string
 import requests
 import shutil
 import gspread
@@ -11,14 +12,21 @@ import traceback

 import archivers
 from storages import S3Storage, S3Config
+from storages.gd_storage import GDConfig, GDStorage
 from utils import GWorksheet, mkdir_if_not_exists

+import sys

+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from googleapiclient.http import MediaFileUpload
+from google.oauth2 import service_account
+
 logger.add("logs/1trace.log", level="TRACE")
 logger.add("logs/2info.log", level="INFO")
 logger.add("logs/3success.log", level="SUCCESS")
 logger.add("logs/4warning.log", level="WARNING")
 logger.add("logs/5error.log", level="ERROR")

 load_dotenv()
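Each logger.add call registers one more loguru sink, and a sink receives messages at its level and above, so 1trace.log collects everything while 5error.log sees only ERROR and CRITICAL. A quick illustration of that behaviour:

from loguru import logger

logger.add("logs/1trace.log", level="TRACE")   # TRACE and above: everything
logger.add("logs/5error.log", level="ERROR")   # ERROR and CRITICAL only

logger.info("row archived")      # written to 1trace.log only
logger.error("upload failed")    # written to both files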
@@ -67,19 +75,24 @@ def expand_url(url):
     return url


-def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumber=False):
+def process_sheet(sheet, usefilenumber, storage, header=1, columns=GWorksheet.COLUMN_NAMES):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)

     # DM test raise error for decorator to catch
     # raise ValueError('A very specific bad thing happened.')

     s3_config = S3Config(
         bucket=os.getenv('DO_BUCKET'),
         region=os.getenv('DO_SPACES_REGION'),
         key=os.getenv('DO_SPACES_KEY'),
         secret=os.getenv('DO_SPACES_SECRET')
     )

+    gd_config = GDConfig(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )

     telegram_config = archivers.TelegramConfig(
         api_id=os.getenv('TELEGRAM_API_ID'),
         api_hash=os.getenv('TELEGRAM_API_HASH')
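Note that GDConfig is populated with the same DO_* (DigitalOcean Spaces) variables as S3Config, yet the GDStorage added later in this commit authenticates via service_account.json and never reads key or secret; only folder (and private) are consumed. A slimmer config would make that explicit -- a hypothetical sketch, not part of this commit:

from dataclasses import dataclass

@dataclass
class GDConfigSlim:          # hypothetical; GDStorage only ever uses these two fields
    folder: str = ""
    private: bool = False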
@@ -87,8 +100,6 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumbe

     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
-        # logger.info(f'Opening worksheet {ii}: "{wks.title}" header={header}')
-        # DM take " out of log message and clarify ii
         logger.info(f'Opening worksheet ii={ii}: {wks.title} header={header}')
         gw = GWorksheet(wks, header_row=header, columns=columns)
@@ -106,6 +117,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumbe
         s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
         s3_client = S3Storage(s3_config)

+        gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
+        gd_client = GDStorage(gd_config)
+
         # loop through rows in worksheet
         for row in range(1 + header, gw.count_rows() + 1):
             url = gw.get_cell(row, 'url')
@@ -139,16 +153,23 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumbe
             # DM put in for telegram screenshots which don't come back
             driver.set_page_load_timeout(120)

+            # client
+            storage_client = None
+            if storage == "s3":
+                storage_client = s3_client
+            elif storage == "gd":
+                storage_client = gd_client
+            else:
+                raise ValueError(f'Cant get storage_client {storage_client}')
+
             # order matters, first to succeed excludes remaining
             active_archivers = [
                 # telethon is the API for telegram eg t.me url's
-                archivers.TelethonArchiver(s3_client, driver, telegram_config),
-                archivers.TelegramArchiver(s3_client, driver),
-                archivers.TiktokArchiver(s3_client, driver),
-                # DM pass facebook cookie
-                archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
-                archivers.TwitterArchiver(s3_client, driver),
-                archivers.WaybackArchiver(s3_client, driver)
+                archivers.TelethonArchiver(storage_client, driver, telegram_config),
+                archivers.TelegramArchiver(storage_client, driver),
+                archivers.TiktokArchiver(storage_client, driver),
+                archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
+                archivers.TwitterArchiver(storage_client, driver),
+                archivers.WaybackArchiver(storage_client, driver)
             ]
             for archiver in active_archivers:
                 logger.debug(f'Trying {archiver} on row {row}')
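Since both clients implement the same Storage interface, the if/elif selection could also be table-driven, which fails loudly with the offending value when a backend name is mistyped. A sketch of that alternative (assumes s3_client and gd_client as above; not what the commit does):

storage_clients = {"s3": s3_client, "gd": gd_client}
try:
    storage_client = storage_clients[storage]
except KeyError:
    raise ValueError(f"unknown storage backend: {storage!r}") from None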
@@ -159,7 +180,7 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumbe
                         # using filenumber to store in folders so can't check for existence of that url
                         result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
                     else:
-                        result = archiver.download(url, check_if_exists=True)
+                        result = archiver.download(url, check_if_exists=True, filenumber=filenumber)

                 except Exception as e:
                     result = False
@@ -207,7 +228,9 @@ def main():
     parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
     parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
     parser.add_argument('--private', action='store_true', help='Store content without public access permission')
-    parser.add_argument('--use-filenumber-as-directory', action='store', dest='usefilenumber', default=False, type=bool, help='False is default and True will save files into a subfolder on cloud storage which has the File Number eg SM3012')
+    parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
+                        help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012')
+    parser.add_argument('--storage', action='store', dest='storage', default='s3', help='s3 or gd storage. Default is s3')

     for k, v in GWorksheet.COLUMN_NAMES.items():
         parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
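The switch away from action='store' with type=bool fixes a real argparse trap: type is applied to the raw string, and bool('False') is True because any non-empty string is truthy. argparse.BooleanOptionalAction (Python 3.9+) instead yields True when the flag is passed, False for the generated --no-... form, and None when absent, which is why main() below normalises the value. A small demonstration:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--flag-old', type=bool)                            # the broken pattern
p.add_argument('--flag-new', action=argparse.BooleanOptionalAction)

print(p.parse_args(['--flag-old', 'False']).flag_old)   # True -- bool('False')
print(p.parse_args(['--flag-new']).flag_new)            # True
print(p.parse_args(['--no-flag-new']).flag_new)         # False
print(p.parse_args([]).flag_new)                        # None -- flag absent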
@@ -215,11 +238,19 @@ def main():
     args = parser.parse_args()
     config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}

-    logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber}')
+    logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')

+    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
+    # filenumber is True (of type bool) when set or None when argument is not there
+    logger.debug(f'usefilenumber type is {type(args.usefilenumber)}')
+    # explicitly setting usefilenumber to a bool
+    usefilenumber = False
+    if args.usefilenumber:
+        usefilenumber = True
+
     mkdir_if_not_exists('tmp')
-    # DM added a feature flag for usefilenumber
-    process_sheet(args.sheet, header=args.header, columns=config_columns, usefilenumber=args.usefilenumber)
+    # DM added usefilenumber (default is False) and storage (default is s3) or gd (Google Drive)
+    process_sheet(args.sheet, usefilenumber=usefilenumber, storage=args.storage, header=args.header, columns=config_columns)
     shutil.rmtree('tmp')
dm_drive.py
@@ -1,59 +0,0 @@
from __future__ import print_function

import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/drive.metadata.readonly']


def main():
    """Shows basic usage of the Drive v3 API.
    Prints the names and ids of the first 10 files the user has access to.
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    try:
        service = build('drive', 'v3', credentials=creds)

        # Call the Drive v3 API
        results = service.files().list(
            pageSize=10, fields="nextPageToken, files(id, name)").execute()
        items = results.get('files', [])

        if not items:
            print('No files found.')
            return
        print('Files:')
        for item in items:
            print(u'{0} ({1})'.format(item['name'], item['id']))
    except HttpError as error:
        # TODO(developer) - Handle errors from drive API.
        print(f'An error occurred: {error}')


if __name__ == '__main__':
    main()
@@ -1,7 +1,4 @@
-
-from __future__ import print_function
-
-import google.auth
+# from __future__ import print_function
 from googleapiclient.discovery import build
 from googleapiclient.errors import HttpError
 from googleapiclient.http import MediaFileUpload
@@ -15,9 +12,182 @@ def upload_appdata():
    creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)

    try:
        # call drive api client
        service = build('drive', 'v3', credentials=creds)

        # 1. list all files and folders
        # results = service.files().list().execute()
        # items = results.get('files', [])

        # for item in items:
        #     print(u'{0} ({1})'.format(item['name'], item['id']))

        # 1.5. Upload photo.jpg image to folder
        # Hash (davemateer@gmail.com)
        dm_hash_folder_id = '1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X'
        # Files auto-archiver (CIR and linked to dave@hmsoftware.co.uk)
        cir_faa_folder_id = '1H2RWV89kSjjS2CJJjAF_YHW3kiXjxm69'

        # file_metadata = {
        #     'name': 'photo.jpg',
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/photo.jpg',
        #                         mimetype='image/jpeg',
        #                         resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # 2. upload another jpg
        # file_metadata = {
        #     'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/twitter__media_FMQg7yeXwAAwNEi.jpg',
        #                         # mimetype='image/jpeg',
        #                         resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # 3. upload html
        # file_metadata = {
        #     'name': 'index.html',
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/index.html',
        #                         # mimetype='image/jpeg',
        #                         resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # 4. upload more html
        # filename = 'twitter__minmyatnaing13_status_1499415562937503751.html'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # png
        # filename = 'youtube_dl__@user52610777_video_70170346222991639302022-04-29T07_25_32.069610.png'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # mkv
        # filename = 'youtube_dl_343188674422293.mkv'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # mp4
        # filename = 'youtube_dl_7017034622299163930.mp4'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # webm
        # filename = 'youtube_dl_sDE-qZdi8p8.webm'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()


        # 5. List only folders
        # results = service.files().list(q="mimeType='application/vnd.google-apps.folder'",
        #                                spaces='drive',  # ie not appDataFolder or photos
        #                                fields='files(id, name)'
        #                                ).execute()
        # items = results.get('files', [])

        # for item in items:
        #     foo = item['name'] + item['id']
        #     print(foo)

        # 6. List only folders within a folder (but not subfolders eg SM005 SM006 but not Subfolder inside SM0005)
        # results = service.files().list(q="'1H2RWV89kSjjS2CJJjAF_YHW3kiXjxm69' in parents",
        results = service.files().list(q=f"'{dm_hash_folder_id}' in parents \
                                       and mimeType='application/vnd.google-apps.folder' ",
                                       spaces='drive',  # ie not appDataFolder or photos
                                       fields='files(id, name)'
                                       ).execute()
        items = results.get('files', [])

        # for item in items:
        #     foo = item['name'] + " " + item['id']
        #     print(foo)

        # 7. Does folder exist within a folder eg SM0005 inside hash and get ID if it does
        results = service.files().list(q=f"'{dm_hash_folder_id}' in parents \
                                       and mimeType='application/vnd.google-apps.folder' \
                                       and name = 'SM0005' ",
                                       spaces='drive',  # ie not appDataFolder or photos
                                       fields='files(id, name)'
                                       ).execute()
        items = results.get('files', [])
        for item in items:
            foo = item['name'] + " " + item['id']
            print(foo)


        # 8. Create folder within Files auto-archiver shared folder
        # file_metadata = {
        #     'name': 'foo',
        #     'mimeType': 'application/vnd.google-apps.folder',
        #     'parents': [folder_id]
        # }
        # file = service.files().create(body=file_metadata, fields='id').execute()
        # new_folder_id = file.get('id')

        # Upload file to newly created folder
        # filename = 'youtube_dl_sDE-qZdi8p8.webm'
        # file_metadata = {
        #     'name': [filename],
        #     'parents': [new_folder_id]
        # }
        # media = MediaFileUpload('files/' + filename, resumable=True)
        # file = service.files().create(body=file_metadata,
        #                               media_body=media,
        #                               fields='id').execute()

        # does folder exist already inside parent of Files auto-archiver

        #
        # List and do paging

    except HttpError as error:
        print(F'An error occurred: {error}')
        file = None

    # return file.get('id')


if __name__ == '__main__':
    upload_appdata()

# create a file in a new folder
# file_metadata = {
#     'name': 'Invoices',
@@ -28,34 +198,15 @@ def upload_appdata():
 # print('Folder ID: %s' % file.get('id'))

 # upload an image
-file_metadata = {'name': 'photo.jpg'}
-media = MediaFileUpload('files/photo.jpg',
-                        mimetype='image/jpeg')
-file = service.files().create(body=file_metadata,
-                              media_body=media,
-                              fields='id').execute()
-id = file.get('id')
-print(f'id: {id}')
+# file_metadata = {'name': 'photo.jpg'}
+# media = MediaFileUpload('files/photo.jpg',
+#                         mimetype='image/jpeg')
+# file = service.files().create(body=file_metadata,
+#                               media_body=media,
+#                               fields='id').execute()
+# id = file.get('id')
+# print(f'id: {id}')

-# list files and folders
-results = service.files().list().execute()
-items = results.get('files', [])
-
-for item in items:
-    print(u'{0} ({1})'.format(item['name'], item['id']))
-
-# upload an image to a folder
-folder_id = '1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X'
-file_metadata = {
-    'name': 'photo.jpg',
-    'parents': [folder_id]
-}
-media = MediaFileUpload('files/photo.jpg',
-                        mimetype='image/jpeg',
-                        resumable=True)
-file = service.files().create(body=file_metadata,
-                              media_body=media,
-                              fields='id').execute()

 # print 'File ID: %s' % file.get('id')
@@ -70,13 +221,3 @@ def upload_appdata():
 # file = service.files().create(body=file_metadata, media_body=media,
 #                               fields='id').execute()
 # print(F'File ID: {file.get("id")}')

-    except HttpError as error:
-        print(F'An error occurred: {error}')
-        file = None
-
-    return file.get('id')
-
-
-if __name__ == '__main__':
-    upload_appdata()
@@ -17,5 +17,19 @@ class Storage(ABC):

     def upload(self, filename: str, key: str, **kwargs):
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
-        with open(filename, 'rb') as f:
-            self.uploadf(f, key, **kwargs)
+        # S3 requires an open file, GD only the filename
+        foo = type(self).__name__
+        if foo == "GDStorage":
+            self.uploadf(filename, key, **kwargs)
+        elif foo == "S3Storage":
+            with open(filename, 'rb') as f:
+                self.uploadf(f, key, **kwargs)
+        else:
+            raise ValueError('Cant get storage thrown from base_storage.py')
+
+
+        # S3 storage requires an open file
+        # with open(filename, 'rb') as f:
+        #     self.uploadf(f, key, **kwargs)
+        # GD storage requires filename
+        # self.uploadf(filename, key, **kwargs)
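Dispatching on type(self).__name__ in the base class couples Storage to its subclasses; the usual object-oriented alternative is to let each subclass adapt the payload itself. A hypothetical refactor sketch, not in this commit:

from abc import ABC, abstractmethod

class Storage(ABC):
    @abstractmethod
    def uploadf(self, file, key, **kwargs):
        ...

    def upload(self, filename: str, key: str, **kwargs):
        # base class stays generic; subclasses decide what uploadf accepts
        self.uploadf(filename, key, **kwargs)

class S3LikeStorage(Storage):
    def uploadf(self, file, key, **kwargs):
        # S3 wants a readable file object, so open the path here
        if isinstance(file, str):
            with open(file, 'rb') as f:
                return self.uploadf(f, key, **kwargs)
        ...  # hand the file object to boto3

class GDLikeStorage(Storage):
    def uploadf(self, file, key, **kwargs):
        ...  # MediaFileUpload takes the filename directly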
@@ -0,0 +1,241 @@
# import boto3
# from botocore.errorfactory import ClientError
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account


@dataclass
class GDConfig:
    bucket: str
    region: str
    key: str
    secret: str
    folder: str = ""
    private: bool = False


class GDStorage(Storage):

    def __init__(self, config: GDConfig):
        self.bucket = config.bucket
        self.region = config.region
        self.folder = config.folder
        self.private = config.private

        SCOPES = ['https://www.googleapis.com/auth/drive']

        creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)

        self.service = build('drive', 'v3', credentials=creds)

        # if len(self.folder) and self.folder[-1] != '/':
        #     self.folder += '/'

        # self.s3 = boto3.client(
        #     's3',
        #     region_name=self.region,
        #     endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
        #     aws_access_key_id=config.key,
        #     aws_secret_access_key=config.secret
        # )

    def _get_path(self, key):
        return self.folder + key

    def get_cdn_url(self, key):
        # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg

        directory = key.split('/', 1)[0]
        logger.debug(f'directory: {directory}')
        # eg twitter__media_asdf.jpg
        filename = key.split('/', 1)[1]
        logger.debug(f'filename: {filename}')

        # TODO put that back to CIR value!
        cir_faa_folder_id = '1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X'

        # need to lookup the id of folder eg SM0002
        results = self.service.files().list(q=f"'{cir_faa_folder_id}' in parents \
                                            and name = '{directory}' ",
                                            spaces='drive',  # ie not appDataFolder or photos
                                            fields='files(id, name)'
                                            ).execute()
        items = results.get('files', [])

        folder_id = None
        for item in items:
            logger.debug(f"found folder of {item['name']}")
            folder_id = item['id']

        if folder_id is None:
            raise ValueError('Cant find folder')

        # check for folder name in file eg youtube_dl_sDE-qZdi8p8/index.html
        # happens doing thumbnails

        # will always return a and a blank b even if there is nothing to split
        a, _, b = filename.partition('/')

        if b != '':
            # a: 'youtube_dl_sDE-qZdi8p8'
            # b: 'index.html'
            logger.debug(f'xxxx need to split on a: {a} and {b}')

            # get id of the sub folder
            results = self.service.files().list(q=f"'{folder_id}' in parents \
                                                and mimeType='application/vnd.google-apps.folder' \
                                                and name = '{a}' ",
                                                spaces='drive',  # ie not appDataFolder or photos
                                                fields='files(id, name)'
                                                ).execute()
            items = results.get('files', [])

            filename = None
            for item in items:
                folder_id = item['id']
                filename = b
            if filename is None:
                raise ValueError('Problem finding folder')

        # get id of file inside folder (or sub folder)
        results = self.service.files().list(q=f"'{folder_id}' in parents \
                                            and name = '{filename}' ",
                                            spaces='drive',  # ie not appDataFolder or photos
                                            fields='files(id, name)'
                                            ).execute()
        items = results.get('files', [])

        file_id = None
        for item in items:
            logger.debug(f"found file of {item['name']}")
            file_id = item['id']

        if file_id is None:
            raise ValueError('Problem finding file')

        foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"

        return foo
        # return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
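The str.partition call is what makes the optional subfolder handling safe, here and again in uploadf below: unlike split('/'), it always returns a 3-tuple, so a key with no subfolder yields an empty third element instead of raising IndexError. For example:

>>> 'youtube_dl_sDE-qZdi8p8/out1.jpg'.partition('/')
('youtube_dl_sDE-qZdi8p8', '/', 'out1.jpg')
>>> 'index.html'.partition('/')
('index.html', '', '')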
    def exists(self, key):
        # try:
        #     self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
        #     return True
        # except ClientError:
        #     return False
        return False

    def uploadf(self, file, key, **kwargs):
        # if self.private:
        #     extra_args = kwargs.get("extra_args", {})
        # else:
        #     extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})

        dm_hash_folder_id = '1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X'

        # Files auto-archiver (CIR and linked to dave@hmsoftware.co.uk)
        # cir_faa_folder_id = '1H2RWV89kSjjS2CJJjAF_YHW3kiXjxm69'
        # TODO put that back to CIR value!
        cir_faa_folder_id = '1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X'

        # Assuming using filenumber as a folder eg SM0002
        # key is 'SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg'

        # split on first occurrence of /
        # eg SM0005
        directory = key.split('/', 1)[0]
        # eg twitter__media_asdf.jpg
        filename = key.split('/', 1)[1]

        # does folder eg SM0005 exist already inside parent of Files auto-archiver
        results = self.service.files().list(q=f"'{cir_faa_folder_id}' in parents \
                                            and mimeType='application/vnd.google-apps.folder' \
                                            and name = '{directory}' ",
                                            spaces='drive',  # ie not appDataFolder or photos
                                            fields='files(id, name)'
                                            ).execute()
        items = results.get('files', [])
        folder_id_to_upload_to = None
        if len(items) > 1:
            logger.error(f'Duplicate folder name of {directory} which should never happen')

        for item in items:
            logger.debug(f"Found existing folder of {item['name']}")
            folder_id_to_upload_to = item['id']

        if folder_id_to_upload_to is None:
            # create new folder
            file_metadata = {
                'name': [directory],
                'mimeType': 'application/vnd.google-apps.folder',
                'parents': [cir_faa_folder_id]
            }
            gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
            folder_id_to_upload_to = gd_file.get('id')

        # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg
        # happens doing thumbnails

        # will always return a and a blank b even if there is nothing to split
        # https://stackoverflow.com/a/38149500/26086
        a, _, b = filename.partition('/')

        if b != '':
            # a: 'youtube_dl_sDE-qZdi8p8'
            # b: 'out1.jpg'
            logger.debug(f'need to split')

            # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
            results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
                                                and mimeType='application/vnd.google-apps.folder' \
                                                and name = '{a}' ",
                                                spaces='drive',  # ie not appDataFolder or photos
                                                fields='files(id, name)'
                                                ).execute()
            items = results.get('files', [])
            sub_folder_id_to_upload_to = None
            if len(items) > 1:
                logger.error(f'Duplicate folder name of {a} which should never happen')

            for item in items:
                logger.debug(f"Found existing folder of {item['name']}")
                sub_folder_id_to_upload_to = item['id']

            if sub_folder_id_to_upload_to is None:
                # create new folder
                file_metadata = {
                    'name': [a],
                    'mimeType': 'application/vnd.google-apps.folder',
                    'parents': [folder_id_to_upload_to]
                }
                gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
                sub_folder_id_to_upload_to = gd_file.get('id')

            filename = b
            folder_id_to_upload_to = sub_folder_id_to_upload_to
            # back to normal control flow

        # else:
        # upload file to gd
        file_metadata = {
            # 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
            'name': [filename],
            'parents': [folder_id_to_upload_to]
        }
        media = MediaFileUpload(file, resumable=True)
        gd_file = self.service.files().create(body=file_metadata,
                                              media_body=media,
                                              fields='id').execute()
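Putting the pieces together, a minimal usage sketch of the new backend (assumes service_account.json exists, the hard-coded parent folder is shared with the service account, and tmp/photo.jpg is a real local file):

import os
from storages.gd_storage import GDConfig, GDStorage

config = GDConfig(
    bucket=os.getenv('DO_BUCKET'),          # carried over from S3Config; unused by Drive auth
    region=os.getenv('DO_SPACES_REGION'),
    key=os.getenv('DO_SPACES_KEY'),
    secret=os.getenv('DO_SPACES_SECRET'),
)
storage = GDStorage(config)

# creates the SM0005 folder under the hard-coded parent if needed, then uploads into it
storage.upload('tmp/photo.jpg', 'SM0005/photo.jpg')
print(storage.get_cdn_url('SM0005/photo.jpg'))   # a drive.google.com/file/d/<id>/view link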