Mirror of https://github.com/bellingcat/auto-archiver
Merge pull request #58 from bellingcat/dev
commit 0bd9e043ed
@@ -16,4 +16,7 @@ config.yaml
 config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
+gd-token.json
+credentials.json
+secrets/*
@@ -26,8 +26,8 @@ class ArchiveResult:
     screenshot: str = None
     hash: str = None


 class Archiver(ABC):
-
+    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"
@@ -47,7 +47,6 @@ class Archiver(ABC):
     def get_netloc(self, url):
         return urlparse(url).netloc

-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
@@ -163,10 +162,12 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read() # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
         return hash.hexdigest()

     def get_screenshot(self, url):
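The configurable hashing above reduces to the following standalone sketch (a hypothetical `file_hash` helper, not code from the repo); only the two algorithm strings shown in the diff are accepted:

```python
import hashlib

def file_hash(filename: str, algorithm: str = "SHA-256") -> str:
    # Read the whole file and dispatch on the configured algorithm name,
    # mirroring the new get_hash logic.
    with open(filename, "rb") as f:
        data = f.read()
    if algorithm == "SHA-256":
        return hashlib.sha256(data).hexdigest()
    if algorithm == "SHA3-512":
        return hashlib.sha3_512(data).hexdigest()
    raise ValueError(f"Unknown hash algorithm: {algorithm}")
```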
@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):

         for u in urls:
             if u is None:
-                logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                 return self.download_alternative(url, tweet_id)
         logger.debug(f"found {urls=}")

@@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo

 from .base_archiver import Archiver, ArchiveResult


 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods, and it works as
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """

     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

@@ -53,11 +53,25 @@ def missing_required_columns(gw: GWorksheet):
     return missing


+def should_process_sheet(c, sheet_name):
+    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
+        # ALLOW rules exist AND sheet name not explicitly allowed
+        return False
+    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
+        # BLOCK rules exist AND sheet name is blocked
+        return False
+    return True
+
+
 def process_sheet(c: Config):
     sh = c.gsheets_client.open(c.sheet)

     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
+        if not should_process_sheet(c, wks.title):
+            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
+            continue
+
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
         gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

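The allow/block semantics are easiest to see in isolation. A runnable sketch, repeating the function from the hunk above and using `SimpleNamespace` as a hypothetical stand-in for the real `Config` object:

```python
from types import SimpleNamespace

def should_process_sheet(c, sheet_name):
    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
        return False
    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
        return False
    return True

# Once an allow-list exists, only listed sheets pass (block-list is moot).
allow_cfg = SimpleNamespace(worksheet_allow={"Sheet1"}, worksheet_block=set())
assert should_process_sheet(allow_cfg, "Sheet1")
assert not should_process_sheet(allow_cfg, "Sheet2")   # not on the allow-list

# With no allow-list, everything passes except blocked names.
block_cfg = SimpleNamespace(worksheet_allow=set(), worksheet_block={"BlockedSheet"})
assert not should_process_sheet(block_cfg, "BlockedSheet")
assert should_process_sheet(block_cfg, "Sheet2")
```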
@@ -80,7 +94,7 @@ def process_sheet(c: Config):
         if not is_retry: continue

         # All checks done - archival process starts here
         try:
             gw.set_cell(row, 'status', 'Archive in progress')
             url = expand_url(url)
             c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -96,7 +110,7 @@ def process_sheet(c: Config):
                 YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                 TelegramArchiver(storage, c.webdriver),
                 TwitterArchiver(storage, c.webdriver),
                 VkArchiver(storage, c.webdriver, c.vk_config),
                 WaybackArchiver(storage, c.webdriver, c.wayback_config)
             ]

@@ -105,7 +119,7 @@ def process_sheet(c: Config):

             try:
                 result = archiver.download(url, check_if_exists=c.check_if_exists)
             except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
             except Exception as e:
                 result = False
                 logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
@@ -1,5 +1,6 @@

 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -50,6 +51,14 @@ class Config:

         self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"

+        def ensure_set(l):
+            # always returns a set of strings, can receive a list or a string
+            l = l if isinstance(l, list) else [l]
+            return set([x for x in l if isinstance(x, str) and len(x) > 0])
+        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
+        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
+
         self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
         self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
         self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
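In isolation, `ensure_set` normalizes whatever YAML produced (a bare string or a list) into a set of non-empty strings; a quick sketch of its behavior:

```python
def ensure_set(l):
    # Wrap a scalar into a list, then keep only non-empty strings.
    l = l if isinstance(l, list) else [l]
    return set([x for x in l if isinstance(x, str) and len(x) > 0])

assert ensure_set("Sheet1") == {"Sheet1"}                   # bare YAML string
assert ensure_set(["Sheet1", "Sheet1", ""]) == {"Sheet1"}   # duplicates, empties dropped
assert ensure_set([]) == set()                              # absent key defaults to []
```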
@@ -73,6 +82,8 @@ class Config:
         )
         self.webdriver = "not initialized"

+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})

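This is a class-attribute override: the value parsed from the `execution` block replaces the default on `Archiver` itself, so every archiver subclass and instance sees it without explicit plumbing. A minimal sketch of the pattern:

```python
class Archiver:
    HASH_ALGORITHM = "SHA-256"  # class default, as in base_archiver.py

execution = {"hash_algorithm": "SHA3-512"}  # as parsed from the config file
Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)

# All instances now see the override through normal attribute lookup.
assert Archiver().HASH_ALGORITHM == "SHA3-512"
```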
@@ -107,6 +118,7 @@ class Config:
             gd = secrets["google_drive"]
             self.gd_config = GDConfig(
                 root_folder_id=gd.get("root_folder_id"),
+                oauth_token_filename=gd.get("oauth_token_filename"),
                 service_account=gd.get("service_account", GDConfig.service_account)
             )

@@ -246,9 +258,12 @@ class Config:
         return json.dumps({
             "config_file": self.config_file,
             "sheet": self.sheet,
+            "worksheet_allow": list(self.worksheet_allow),
+            "worksheet_block": list(self.worksheet_block),
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
@@ -0,0 +1,73 @@
+import os.path
+
+from google.auth.transport.requests import Request
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+# If creating for the first time, download the OAuth Client IDs json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs
+# add "http://localhost:55192/" to the list of "Authorised redirect URIs"
+# https://davemateer.com/2022/04/28/google-drive-with-python for more information
+
+# You can run this code to get a new token and verify it belongs to the correct user
+# This token will be refreshed automatically by the auto-archiver
+
+# Code below from https://developers.google.com/drive/api/quickstart/python
+
+SCOPES = ['https://www.googleapis.com/auth/drive']
+
+
+def main():
+    token_file = 'gd-token.json'
+    creds = None
+
+    # The token file stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists(token_file):
+        creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            print('Requesting new token')
+            creds.refresh(Request())
+        else:
+            print('First run through so putting up login dialog')
+            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
+            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
+            creds = flow.run_local_server(port=55192)
+        # Save the credentials for the next run
+        with open(token_file, 'w') as token:
+            print('Saving new token')
+            token.write(creds.to_json())
+    else:
+        print('Token valid')
+
+    try:
+        service = build('drive', 'v3', credentials=creds)
+
+        # About the user
+        results = service.about().get(fields="*").execute()
+        emailAddress = results['user']['emailAddress']
+        print(emailAddress)
+
+        # Call the Drive v3 API and return some files
+        results = service.files().list(
+            pageSize=10, fields="nextPageToken, files(id, name)").execute()
+        items = results.get('files', [])
+
+        if not items:
+            print('No files found.')
+            return
+        print('Files:')
+        for item in items:
+            print(u'{0} ({1})'.format(item['name'], item['id']))
+
+    except HttpError as error:
+        print(f'An error occurred: {error}')
+
+
+if __name__ == '__main__':
+    main()
@@ -18,8 +18,19 @@ secrets:

   # needed if you use storage=gd
   google_drive:
-    # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
-    service_account: "service_account.json"
+    # To authenticate with google you have two options (1. service account OR 2. OAuth token)
+
+    # 1. service account - storage space will count towards the developer account
+    # filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
+    # service_account: "service_account.json"
+
+    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
+    # (only 1. or 2. - if both specified then this 2. takes precedence)
+    # needs write access on the server so refresh flow works
+    # To get the token, run the file `create_update_test_oauth_token.py`
+    # you can edit that file if you want a different token filename, default is "gd-token.json"
+    oauth_token_filename: "gd-token.json"
+
     root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

   # needed if you use storage=local
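The precedence these comments describe (the actual selection lives in `gd_storage.py`, further down in this diff) can be sketched with a hypothetical `pick_auth` helper:

```python
def pick_auth(oauth_token_filename, service_account="service_account.json"):
    # An OAuth token filename, when present, wins over the service account.
    if oauth_token_filename is not None:
        return ("oauth", oauth_token_filename)
    return ("service_account", service_account)

assert pick_auth("gd-token.json") == ("oauth", "gd-token.json")
assert pick_auth(None) == ("service_account", "service_account.json")
```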
@@ -65,12 +76,25 @@ secrets:
 execution:
   # can be overwritten with CMD --sheet=
   sheet: your-sheet-name

+  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
+  # worksheet_allow and worksheet_block can be single values or lists
+  # if worksheet_allow is specified, worksheet_block is ignored
+  # worksheet_allow:
+  #   - Sheet1
+  #   - "Sheet 2"
+  # worksheet_block: BlockedSheet
+
   # which row of your tabs contains the header, can be overwritten with CMD --header=
   header: 1
   # which storage to use, can be overwritten with CMD --storage=
   storage: s3
   # defaults to false, when true will try to avoid duplicate URL archives
   check_if_exists: true

+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256
+
   # optional configurations for the selenium browser that takes screenshots, these are the defaults
   selenium:
     # values under 10s might mean screenshots fail to grab screenshot
@@ -95,3 +119,4 @@ execution:
     duration: duration
     screenshot: screenshot
     hash: hash
+
@@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account

+from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request
+
 @dataclass
 class GDConfig:
     root_folder_id: str
-    folder: str = "default"
+    oauth_token_filename: str
     service_account: str = "service_account.json"
+    folder: str = "default"

 class GDStorage(Storage):
     def __init__(self, config: GDConfig):
         self.folder = config.folder
         self.root_folder_id = config.root_folder_id
-        creds = service_account.Credentials.from_service_account_file(
-            config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
+        SCOPES=['https://www.googleapis.com/auth/drive']
+
+        token_file = config.oauth_token_filename
+        if token_file is not None:
+            """
+            Tokens are refreshed after 1 hour
+            however keep working for 7 days (tbc)
+            so as long as the job doesn't last for 7 days
+            then this method of refreshing only once per run will work
+            see this link for details on the token
+            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+            """
+            logger.debug(f'Using GD OAuth token {token_file}')
+            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+            if not creds or not creds.valid:
+                if creds and creds.expired and creds.refresh_token:
+                    logger.debug('Requesting new GD OAuth token')
+                    creds.refresh(Request())
+                else:
+                    raise Exception("Problem with creds - create the token again")
+
+                # Save the credentials for the next run
+                with open(token_file, 'w') as token:
+                    logger.debug('Saving new GD OAuth token')
+                    token.write(creds.to_json())
+            else:
+                logger.debug('GD OAuth Token valid')
+        else:
+            gd_service_account = config.service_account
+            logger.debug(f'Using GD Service Account {gd_service_account}')
+            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+
         self.service = build('drive', 'v3', credentials=creds)

     def get_cdn_url(self, key):
|
@ -28,6 +63,8 @@ class GDStorage(Storage):
|
||||||
only support files saved in a folder for GD
|
only support files saved in a folder for GD
|
||||||
S3 supports folder and all stored in the root
|
S3 supports folder and all stored in the root
|
||||||
"""
|
"""
|
||||||
|
key = self.clean_key(key)
|
||||||
|
|
||||||
full_name = os.path.join(self.folder, key)
|
full_name = os.path.join(self.folder, key)
|
||||||
parent_id, folder_id = self.root_folder_id, None
|
parent_id, folder_id = self.root_folder_id, None
|
||||||
path_parts = full_name.split(os.path.sep)
|
path_parts = full_name.split(os.path.sep)
|
||||||
|
@@ -52,6 +89,8 @@ class GDStorage(Storage):
         1. for each sub-folder in the path check if exists or create
         2. upload file to root_id/other_paths.../filename
         """
+        key = self.clean_key(key)
+
         full_name = os.path.join(self.folder, key)
         parent_id, upload_to = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)
@@ -77,13 +116,21 @@ class GDStorage(Storage):
         # GD only requires the filename not a file reader
         self.uploadf(filename, key, **kwargs)

-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
+    def clean_key(self, key):
+        # GDrive does not work well with leading forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
+    # gets the Drive folderID if it is there
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
         """
         Retrieves the id of a folder or file from its @name and the @parent_id folder
         Optionally does multiple @retries and sleeps @sleep_seconds between them
         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
         If @raise_on_missing will throw error when not found, or returns None
-        Will remember previous calls to avoid duplication if @use_cache
+        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
         Returns the id of the file or folder from its name as a string
         """
         # cache logic
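The new `clean_key` in isolation, with an example key in the style used elsewhere in this diff:

```python
def clean_key(key: str) -> str:
    # Drive name lookups fail on keys carrying a leading "/", so strip one.
    if key.startswith('/'):
        return key[1:]
    return key

assert clean_key('/SM3013/file.html') == 'SM3013/file.html'
assert clean_key('SM3013/file.html') == 'SM3013/file.html'   # already clean, unchanged
```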
@@ -96,7 +143,7 @@ class GDStorage(Storage):

         # API logic
         debug_header: str = f"[searching {name=} in {parent_id=}]"
-        query_string = f"'{parent_id}' in parents and name = '{name}' "
+        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
         if use_mime_type:
             query_string += f" and mimeType='application/vnd.google-apps.folder' "

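For reference, the Drive v3 query this builds, with hypothetical IDs; adding `trashed = false` stops items that were moved to the Drive bin from matching by name again:

```python
parent_id, name = "1AbCdEfGhIjK", "SM3013"  # placeholder values
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
query_string += " and mimeType='application/vnd.google-apps.folder' "  # folders only
print(query_string)
```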
@@ -1,4 +1,4 @@
-import uuid, os
+import uuid, os, mimetypes
 from dataclasses import dataclass

 import boto3
@@ -21,6 +21,7 @@ class S3Config:
     private: bool = False
     key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid

+
 class S3Storage(Storage):

     def __init__(self, config: S3Config):
|
||||||
extra_args = kwargs.get("extra_args", {})
|
extra_args = kwargs.get("extra_args", {})
|
||||||
else:
|
else:
|
||||||
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
|
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
|
||||||
|
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
|
||||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
|
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
|
||||||
|
|
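`mimetypes.guess_type` infers the content type from the key's file extension; note (our observation, not something the diff handles) that it returns `(None, None)` for unknown extensions, so the `ContentType` passed through `ExtraArgs` can be `None`:

```python
import mimetypes

print(mimetypes.guess_type("video.mp4")[0])        # video/mp4
print(mimetypes.guess_type("page.html")[0])        # text/html
print(mimetypes.guess_type("file.unknownext")[0])  # None - no fallback applied
```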