mirror of https://github.com/bellingcat/auto-archiver
commit f87acb6d1d ("refactor")
parent d46b8e1157
@@ -13,3 +13,4 @@ anon*
 config.json
 config-*.json
 logs/*
+local_archive/
@@ -35,6 +35,9 @@ class Archiver(ABC):
     def __str__(self):
         return self.__class__.__name__
 
+    def __repr__(self):
+        return self.__str__()
+
     @abstractmethod
     def download(self, url, check_if_exists=False): pass
 
@@ -134,6 +137,7 @@ class Archiver(ABC):
         return hash.hexdigest()
 
     def get_screenshot(self, url):
+        logger.debug(f"getting screenshot for {url=}")
         key = self.get_key(urlparse(url).path.replace(
             "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
         filename = Storage.TMP_FOLDER + key
@@ -18,8 +18,8 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = self.get_key(f'{info.id}.mp4')
-            cdn_url = self.storage.get_cdn_url(key)
             filename = Storage.TMP_FOLDER + key
+            logger.info(f'found video {key=}')
 
             if check_if_exists and self.storage.exists(key):
                 status = 'already archived'
@@ -28,13 +28,15 @@ class TiktokArchiver(Archiver):
 
             if len(media) <= 0:
                 if status == 'already archived':
-                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                 else:
                     return ArchiveResult(status='Could not download media')
 
+            logger.info(f'downloading video {key=}')
             media[0].download(filename)
 
             if status != 'already archived':
+                logger.info(f'uploading video {key=}')
                 self.storage.upload(filename, key)
 
             try:
@@ -50,6 +52,7 @@ class TiktokArchiver(Archiver):
             try: os.remove(filename)
             except FileNotFoundError:
                 logger.info(f'tmp file not found thus not deleted {filename}')
+            cdn_url = self.storage.get_cdn_url(key)
 
             return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
@@ -8,26 +8,31 @@ from .base_archiver import Archiver, ArchiveResult
 from configs import WaybackConfig
 
 
 class WaybackArchiver(Archiver):
     name = "wayback"
 
     def __init__(self, storage: Storage, driver, config: WaybackConfig):
         super(WaybackArchiver, self).__init__(storage, driver)
         self.config = config
+        # TODO: this logic should live at the auto-archiver level
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
-        if check_if_exists and url in self.seen_urls:
-            return self.seen_urls[url]
+        if check_if_exists:
+            if url in self.seen_urls: return self.seen_urls[url]
 
+            logger.debug(f"checking if {url=} already on archive.org")
+            archive_url = f"https://web.archive.org/web/{url}"
+            req = requests.get(archive_url)
+            if req.status_code == 200:
+                return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived')
 
+        logger.debug(f"POSTing {url=} to web.archive.org")
         ia_headers = {
             "Accept": "application/json",
             "Authorization": f"LOW {self.config.key}:{self.config.secret}"
         }
-        r = requests.post(
-            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
@@ -38,47 +43,41 @@ class WaybackArchiver(Archiver):
             return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
 
         job_id = r.json()['job_id']
-        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        logger.debug(f"GETting status for {job_id=} on {url=}")
+        status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
 
         retries = 0
 
+        # TODO: make the job queue parallel -> consider propagation of results back to sheet though
         # wait 90-120 seconds for the archive job to finish
         while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
             time.sleep(3)
 
             try:
-                status_r = requests.get(
-                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+                logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
+                status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
             except:
                 time.sleep(1)
 
             retries += 1
 
         if status_r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
             return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
-        archive_url = 'https://web.archive.org/web/' + \
-            status_json['timestamp'] + '/' + status_json['original_url']
+        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
+        return self.if_archived_return_with_screenshot(archive_url)
 
+    def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'):
         try:
-            r = requests.get(archive_url)
-            parsed = BeautifulSoup(r.content, 'html.parser')
+            if req is None:
+                req = requests.get(archive_url)
+            parsed = BeautifulSoup(req.content, 'html.parser')
 
             title = parsed.find_all('title')[0].text
 
             if title == 'Wayback Machine':
                 title = 'Could not get title'
         except:
             title = "Could not get title"
 
         screenshot = self.get_screenshot(url)
-        result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
-        self.seen_urls[url] = result
-        return result
+        self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot)
+        return self.seen_urls[url]
@@ -1,11 +1,12 @@
-import datetime
-import shutil
-import traceback
+import os, datetime, shutil, traceback
 from loguru import logger
+from slugify import slugify
 
 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
 from configs import Config
+from storages import Storage
 
 
 def update_sheet(gw, row, result: ArchiveResult):
@@ -42,12 +43,12 @@ def update_sheet(gw, row, result: ArchiveResult):
 
 
 def missing_required_columns(gw: GWorksheet):
-    required_found = True
+    missing = False
     for required_col in ['url', 'status']:
         if not gw.col_exists(required_col):
-            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.worksheet.title}')
-            required_found = False
-    return required_found
+            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
+            missing = True
+    return missing
 
 
 def process_sheet(c: Config):
@@ -60,9 +61,9 @@ def process_sheet(c: Config):
 
         if missing_required_columns(gw): continue
 
-        # archives will be in a folder 'doc_name/worksheet_name'
-        # TODO: use slugify lib
-        c.set_folder(f'{c.sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
+        # archives will default to being in a folder 'doc_name/worksheet_name'
+        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
+        c.set_folder(default_folder)
         storage = c.get_storage()
 
         # loop through rows in worksheet
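Note on the default_folder line above: slugify (newly imported from the python-slugify package) lowercases names and replaces spaces with hyphens, so, assuming for illustration a document called "Test Hashing" and a worksheet called "Sheet 1", the resulting path would look like this sketch:

# illustrative values only
from slugify import slugify
import os
default_folder = os.path.join(slugify("Test Hashing"), slugify("Sheet 1"))
# -> "test-hashing/sheet-1", replacing the old manual replace(" ", "_") scheme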
@@ -76,7 +77,7 @@ def process_sheet(c: Config):
             # All checks done - archival process starts here
             gw.set_cell(row, 'status', 'Archive in progress')
             url = expand_url(url)
-            storage.update_properties(subfolder=gw.get_cell_or_default(row, 'subfolder'))
+            c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
 
             # make a new driver so each spreadsheet row is idempotent
             c.recreate_webdriver()
@@ -92,26 +93,27 @@ def process_sheet(c: Config):
             ]
 
             for archiver in active_archivers:
-                logger.debug(f'Trying {archiver=} on {row=}')
+                logger.debug(f'Trying {archiver} on {row=}')
 
                 try:
                     result = archiver.download(url, check_if_exists=True)
                 except KeyboardInterrupt:
                     # catches keyboard interruptions to do a clean exit
-                    logger.warning(f"caught interrupt for {archiver=} on {row=}")
+                    logger.warning(f"caught interrupt for {archiver} on {row=}")
                     gw.set_cell(row, 'status', '')
                     c.destroy_webdriver()
                     exit()
                 except Exception as e:
                     result = False
-                    logger.error(f'Got unexpected error in row {row} with {archiver=} for {url=}: {e}\n{traceback.format_exc()}')
+                    logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
 
                 if result:
+                    success = result.status in ['success', 'already archived']
                     result.status = f"{archiver.name}: {result.status}"
-                    if result.status in ['success', 'already archived']:
-                        logger.success(f'{archiver=} succeeded on {row=}, {url=}')
+                    if success:
+                        logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                         break
-                    logger.warning(f'{archiver} did not succeed on {row=}, final status: {result.status}')
+                    logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
 
             if result:
                 update_sheet(gw, row, result)
@@ -125,10 +127,10 @@ def main():
     c = Config()
     c.parse()
     logger.info(f'Opening document {c.sheet} for header {c.header}')
-    mkdir_if_not_exists(c.tmp_folder)
+    mkdir_if_not_exists(Storage.TMP_FOLDER)
     process_sheet(c)
     c.destroy_webdriver()
-    shutil.rmtree(c.tmp_folder)
+    shutil.rmtree(Storage.TMP_FOLDER)
 
 
 if __name__ == '__main__':
@@ -3,12 +3,12 @@ import argparse, json
 import gspread
 from loguru import logger
 from selenium import webdriver
-from dataclasses import dataclass
+from dataclasses import dataclass, asdict
 
-from utils.gworksheet import GWorksheet
+from utils import GWorksheet, getattr_or
 from .wayback_config import WaybackConfig
 from .telethon_config import TelethonConfig
-from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage
+from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
 
 
 @dataclass
@@ -39,6 +39,7 @@ class Config:
         self.set_log_files()
 
     def set_log_files(self):
+        # TODO: isolate to config
         logger.add("logs/1trace.log", level="TRACE")
         logger.add("logs/2info.log", level="INFO")
         logger.add("logs/3success.log", level="SUCCESS")
@@ -59,21 +60,18 @@ class Config:
         # ----------------------EXECUTION - execution configurations
         execution = self.config.get("execution", {})
 
-        self.sheet = getattr(self.args, "sheet", execution.get("sheet"))
+        self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
-        self.header = int(getattr(self.args, "header", execution.get("header", 1)))
+        self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
         Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER)
-        self.storage = getattr(self.args, "storage", execution.get("storage", "s3"))
-
-        for key, name in [("s3", "s3"), ("gd", "google_drive")]:
-            assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
+        self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
 
         # Column names come from config and can be overwritten by CMD
         # in the end all are considered as lower case
         config_column_names = execution.get("column_names", {})
         self.column_names = {}
         for k in GWorksheet.COLUMN_NAMES.keys():
-            self.column_names[k] = getattr(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
+            self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
 
         # selenium driver
         selenium_configs = execution.get("selenium", {})
@@ -87,6 +85,10 @@ class Config:
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
 
+        # assert selected storage credentials exist
+        for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
+            assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
+
         # google sheets config
         self.gsheets_client = gspread.service_account(
             filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
@@ -106,8 +108,7 @@ class Config:
                 endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
                 cdn_url=s3.get("cdn_url", S3Config.cdn_url),
                 key_path=s3.get("key_path", S3Config.key_path),
-                private=getattr(self.args, "s3-private", s3.get("private", S3Config.private)),
-                no_folder=s3.get("no_folder", S3Config.no_folder),
+                private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private))
             )
 
         # GDrive config
@@ -115,8 +116,12 @@ class Config:
             gd = secrets["google_drive"]
             self.gd_config = GDConfig(
                 root_folder_id=gd.get("root_folder_id"),
-                default_folder=gd.get("default_folder", GDConfig.default_folder),
-                service_account=gd.get("service_account", GDConfig.service_account),
+                service_account=gd.get("service_account", GDConfig.service_account)
+            )
+
+        if "local" in secrets:
+            self.local_config = LocalConfig(
+                save_to=secrets["local"].get("save_to", LocalConfig.save_to),
             )
 
         # wayback machine config
@@ -153,30 +158,40 @@ class Config:
 
         for k, v in GWorksheet.COLUMN_NAMES.items():
             help = f"the name of the column to FILL WITH {k} (default='{v}')"
-            if k in ["url", "subfolder"]:
+            if k in ["url", "folder"]:
                 help = f"the name of the column to READ {k} FROM (default='{v}')"
             parser.add_argument(f'--col-{k}', action='store', dest=k, help=help)
 
         return parser
 
     def set_folder(self, folder):
-        # update the folder in each of the storages
+        """
+        update the folder in each of the storages
+        """
         self.folder = folder
-        if self.s3_config:
-            self.s3_config.folder = folder
-        if self.gd_config:
-            self.gd_config.default_folder = folder
+        # s3
+        if hasattr(self, "s3_config"): self.s3_config.folder = folder
+        if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
+        # gdrive
+        if hasattr(self, "gd_config"): self.gd_config.folder = folder
+        if hasattr(self, "gd_storage"): self.gd_storage.folder = folder
+        # local
+        if hasattr(self, "local_config"): self.local_config.folder = folder
+        if hasattr(self, "local_storage"): self.local_storage.folder = folder
 
     def get_storage(self):
         """
-        creates and returns the configured type of storage
+        returns the configured type of storage, creating if needed
        """
         if self.storage == "s3":
-            return S3Storage(self.s3_config)
+            self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config))
+            return self.s3_storage
         elif self.storage == "gd":
-            return GDStorage(self.gd_config)
+            self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config))
+            return self.gd_storage
         elif self.storage == "local":
-            return LocalStorage(self.folder)
+            self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config))
+            return self.local_storage
         raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}"
 
     def destroy_webdriver(self):
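A note on the set_folder/get_storage changes above: get_storage now keeps the created storage on the Config instance (via getattr_or), so later set_folder calls mutate the same object that process_sheet already holds. A minimal sketch of the intended per-row flow, with made-up folder names:

# illustrative only; assumes an s3 storage was configured
c.set_folder("test-hashing/sheet-1")   # worksheet default
storage = c.get_storage()              # creates and caches c.s3_storage
c.set_folder("per-row-folder")         # per row: updates the cached storage too
assert storage.folder == "per-row-folder"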
@@ -197,12 +212,13 @@ class Config:
         return json.dumps({
             "config_file": self.config_file,
             "sheet": self.sheet,
+            "storage": self.storage,
             "header": self.header,
             "tmp_folder": Storage.TMP_FOLDER,
-            "selenium_config": self.selenium_config,
+            "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
             "s3_config": self.s3_config != None,
-            "s3_private": getattr(self.s3_config, "private", None),
+            "s3_private": getattr_or(self.s3_config, "private", None),
             "wayback_config": self.wayback_config != None,
             "telegram_config": self.telegram_config != None,
             "gsheets_client": self.gsheets_client != None,
@@ -1,5 +1,5 @@
 # we need to explicitly expose the available imports here
 from .base_storage import Storage
-from .local_storage import LocalStorage
+from .local_storage import LocalStorage, LocalConfig
 from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage
@@ -23,23 +23,7 @@ class Storage(ABC):
         with open(filename, 'rb') as f:
             self.uploadf(f, key, **kwargs)
 
-    def update_properties(self, **kwargs):
-        """
-        method used to update general properties that some children may use
-        and others not, but that all can call
-        """
-        for k, v in kwargs.items():
-            if k in self._get_allowed_properties():
-                setattr(self, k, v)
-            else:
-                logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
-
-    def _get_allowed_properties(self):
-        """
-        child classes should specify which properties they allow to be set
-        """
-        return set(["subfolder"])
-
+    #TODO: is this really necessary if only use os.path operations
     def _clean_path(self, folder, default="", add_forward_slash=True):
         if folder is None or type(folder) != str or len(folder.strip()) == 0:
             return default
@@ -1,24 +1,23 @@
+import os, time
 
 from loguru import logger
 from .base_storage import Storage
 from dataclasses import dataclass
 
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account
 
-import time
-
 
 @dataclass
 class GDConfig:
     root_folder_id: str
-    default_folder: str = "default"
+    folder: str = "default"
     service_account: str = "service_account.json"
 
 
 class GDStorage(Storage):
     def __init__(self, config: GDConfig):
-        self.default_folder = config.default_folder
+        self.folder = config.folder
         self.root_folder_id = config.root_folder_id
         creds = service_account.Credentials.from_service_account_file(
             config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
@@ -29,77 +28,73 @@ class GDStorage(Storage):
         only support files saved in a folder for GD
         S3 supports folder and all stored in the root
         """
-        self.subfolder = self._clean_path(self.subfolder, self.default_folder, False)
-        filename = key
-        logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
-        folder_id = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, 5, 10)
-
-        # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails
-        # a='youtube_dl_abcde', b='index.html'
-        a, _, b = filename.partition('/')
-        if b != '':
-            logger.debug(f'get_cdn_url: Found a subfolder so need to split on: {a=} and {b=}')
-            folder_id = self._get_id_from_parent_and_name(folder_id, a, use_mime_type=True)
-            filename = b
+        full_name = os.path.join(self.folder, key)
+        parent_id, folder_id = self.root_folder_id, None
+        path_parts = full_name.split(os.path.sep)
+        filename = path_parts[-1]
+        logger.info(f"looking for folders for {path_parts=} before uploading {filename=}")
+        for folder in path_parts[0:-1]:
+            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
+            parent_id = folder_id
 
         # get id of file inside folder (or sub folder)
         file_id = self._get_id_from_parent_and_name(folder_id, filename)
         return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
 
-    def exists(self, _key):
-        # TODO: How to check for google drive, as it accepts different names?
-        return False
+    def exists(self, key):
+        try:
+            self.get_cdn_url(key)
+            return True
+        except: return False
 
-    def uploadf(self, file, key, **_kwargs):
+    def uploadf(self, file: str, key: str, **_kwargs):
         """
-        1. check if subfolder exists or create it
-        2. check if key contains sub-subfolder, check if exists or create it
-        3. upload file to root_id/subfolder[/sub-subfolder]/filename
+        1. for each sub-folder in the path check if exists or create
+        2. upload file to root_id/other_paths.../filename
         """
-        self.subfolder = self._clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
-        filename = key
-
-        # get id of subfolder or create if it does not exist
-        folder_id_to_upload_to = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, use_mime_type=True, raise_on_missing=False)
-        if folder_id_to_upload_to is None:
-            folder_id_to_upload_to = self._mkdir(self.subfolder, self.root_folder_id)
-
-        # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails
-        # a='youtube_dl_abcde', b='index.html'
-        a, _, b = filename.partition('/')
-        if b != '':
-            logger.debug(f'uploadf: Found a subfolder so need to split on: {a=} and {b=}')
-            # get id of subfolder or create if it does not exist
-            sub_folder_id_to_upload_to = self._get_id_from_parent_and_name(folder_id_to_upload_to, a, use_mime_type=True, raise_on_missing=False)
-            if sub_folder_id_to_upload_to is None:
-                sub_folder_id_to_upload_to = self._mkdir(a, folder_id_to_upload_to)
-
-            filename = b
-            folder_id_to_upload_to = sub_folder_id_to_upload_to
+        full_name = os.path.join(self.folder, key)
+        parent_id, upload_to = self.root_folder_id, None
+        path_parts = full_name.split(os.path.sep)
+        filename = path_parts[-1]
+        logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
+        for folder in path_parts[0:-1]:
+            upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+            if upload_to is None:
+                upload_to = self._mkdir(folder, parent_id)
+            parent_id = upload_to
 
         # upload file to gd
+        logger.debug(f'uploading {filename=} to folder id {upload_to}')
         file_metadata = {
             'name': [filename],
-            'parents': [folder_id_to_upload_to]
+            'parents': [upload_to]
         }
         media = MediaFileUpload(file, resumable=True)
         gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
-        logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={folder_id_to_upload_to}')
+        logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
 
     def upload(self, filename: str, key: str, **kwargs):
         # GD only requires the filename not a file reader
-        logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         self.uploadf(filename, key, **kwargs)
 
-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True):
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
         """
         Retrieves the id of a folder or file from its @name and the @parent_id folder
         Optionally does multiple @retries and sleeps @sleep_seconds between them
         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
         If @raise_on_missing will throw error when not found, or returns None
+        Will remember previous calls to avoid duplication if @use_cache
         Returns the id of the file or folder from its name as a string
         """
+        # cache logic
+        if use_cache:
+            self.api_cache = getattr(self, "api_cache", {})
+            cache_key = f"{parent_id}_{name}_{use_mime_type}"
+            if cache_key in self.api_cache:
+                logger.debug(f"cache hit for {cache_key=}")
+                return self.api_cache[cache_key]
 
+        # API logic
         debug_header: str = f"[searching {name=} in {parent_id=}]"
         query_string = f"'{parent_id}' in parents and name = '{name}' "
         if use_mime_type:
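To make the new GDrive folder handling concrete: both get_cdn_url and uploadf now join self.folder with the key and treat every path segment except the last as a Drive folder to look up (or, in uploadf, create). A rough sketch with made-up names:

# illustrative only
import os
full_name = os.path.join("test-hashing/sheet-1", "youtube_dl_abc/index.html")
path_parts = full_name.split(os.path.sep)
# path_parts == ['test-hashing', 'sheet-1', 'youtube_dl_abc', 'index.html']
# each folder part is resolved under the previous parent_id, starting from root_folder_id;
# the last part is the file name that is fetched/uploaded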
@@ -115,10 +110,14 @@ class GDStorage(Storage):
 
             if len(items) > 0:
                 logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
-                return items[-1]['id']
+                _id = items[-1]['id']
+                if use_cache: self.api_cache[cache_key] = _id
+                return _id
             else:
-                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}. sleeping for {sleep_seconds} second(s)')
-                if attempt < retries - 1: time.sleep(sleep_seconds)
+                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
+                if attempt < retries - 1:
+                    logger.debug(f'sleeping for {sleep_seconds} second(s)')
+                    time.sleep(sleep_seconds)
 
         if raise_on_missing:
             raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
@@ -129,7 +128,7 @@ class GDStorage(Storage):
         Creates a new GDrive folder @name inside folder @parent_id
         Returns id of the created folder
         """
-        logger.debug(f'[_mkdir] Creating new folder with {name=} inside {parent_id=}')
+        logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
         file_metadata = {
             'name': [name],
             'mimeType': 'application/vnd.google-apps.folder',
@@ -1,13 +1,26 @@
 import os
-from .base_storage import Storage
 
+from dataclasses import dataclass
+
+from .base_storage import Storage
+from utils import mkdir_if_not_exists
+
+
+@dataclass
+class LocalConfig:
+    folder: str = ""
+    save_to: str = "./"
 
 class LocalStorage(Storage):
-    def __init__(self, folder):
-        self.folder = self._clean_path(folder)
+    def __init__(self, config:LocalConfig):
+        self.folder = self._clean_path(config.folder)
+        self.save_to = self._clean_path(config.save_to)
+        mkdir_if_not_exists(self.save_to)
 
     def get_cdn_url(self, key):
-        return self.folder + self._clean_path(self.subfolder) + key
+        full_path = os.path.join(self.save_to, self.folder, key)
+        mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
+        return os.path.abspath(full_path)
 
     def exists(self, key):
         return os.path.isfile(self.get_cdn_url(key))
@@ -20,8 +20,6 @@ class S3Config:
     cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
     private: bool = False
     key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid
-    no_folder: bool = False # when true folders are not used for url path
-
 
 
 class S3Storage(Storage):
@@ -54,7 +52,7 @@ class S3Storage(Storage):
                 ext = os.path.splitext(key)[1]
                 self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
             final_key = self.key_dict[key]
-        return self.folder + self._clean_path(self.subfolder) + final_key
+        return os.path.join(self.folder, final_key)
 
     def get_cdn_url(self, key):
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
@@ -10,10 +10,10 @@ class GWorksheet:
     """
     COLUMN_NAMES = {
         'url': 'link',
-        'subfolder': 'sub folder',
+        'status': 'archive status',
+        'folder': 'destination folder',
         'archive': 'archive location',
         'date': 'archive date',
-        'status': 'archive status',
         'thumbnail': 'thumbnail',
         'thumbnail_index': 'thumbnail index',
         'timestamp': 'upload timestamp',
@@ -72,12 +72,15 @@ class GWorksheet:
             return ''
         return row[col_index]
 
-    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
+    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
         """
         return self.get_cell or default value on error (eg: column is missing)
         """
         try:
-            return self.get_cell(row, col, fresh)
+            val = self.get_cell(row, col, fresh)
+            if when_empty_use_default and val.strip() == "":
+                return default
+            return val
         except:
             return default
 
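Usage sketch for the new when_empty_use_default flag (values are illustrative): an empty cell now falls back to the default instead of returning the empty string, which is what process_sheet relies on for the per-row 'folder' column.

gw.get_cell_or_default(row, 'folder', 'test-hashing/sheet-1', when_empty_use_default=True)
# -> 'test-hashing/sheet-1' when the column is missing or the cell contains only whitespace
# -> the cell's own value otherwise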
@@ -1,11 +1,11 @@
-import os, requests
+import os, sys, requests
 from loguru import logger
 
 
 def mkdir_if_not_exists(folder):
     if not os.path.exists(folder):
-        os.mkdir(folder)
+        os.makedirs(folder)
 
 
 def expand_url(url):
@@ -18,3 +18,11 @@ def expand_url(url):
     except:
         logger.error(f'Failed to expand url {url}')
         return url
+
+def getattr_or(o: object, prop: str, default: None = None):
+    try:
+        res = getattr(o, prop)
+        if res is None: raise
+        return res
+    except:
+        return default
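getattr_or differs from a plain getattr(obj, prop, default) in that it also falls back to the default when the attribute exists but is None, which is the argparse case, since unset optional arguments are stored as None on the namespace. A small sketch:

# illustrative only
from types import SimpleNamespace
args = SimpleNamespace(sheet=None)        # argparse stores unset options as None
getattr(args, "sheet", "from-config")     # -> None, the config fallback is never used
getattr_or(args, "sheet", "from-config")  # -> "from-config"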