final naming cleanup + new feeders/dbs

pull/72/head
msramalho 2023-01-21 19:44:12 +00:00
parent 753039240f
commit b763fc4188
34 changed files with 322 additions and 927 deletions

.gitignore (vendored): 3 changed lines
View file

@@ -26,4 +26,5 @@ instaloader/*
instaloader.session
orchestration.yaml
auto_archiver.egg-info*
logs*
logs*
*.csv

View file

@@ -2,6 +2,6 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
# need to manually specify due to cyclical deps
from .core.orchestrator import ArchivingOrchestrator
from .core.v2config import ConfigV2
from .core.config import Config
# making accessible directly
from .core.metadata import Metadata

View file

@@ -1,8 +1,8 @@
from . import ConfigV2
from . import Config
from . import ArchivingOrchestrator
def main():
config = ConfigV2()
config = Config()
config.parse()
orchestrator = ArchivingOrchestrator(config)
orchestrator.feed()

View file

@@ -10,12 +10,12 @@
# from .twitter_api_archiver import TwitterApiArchiver
# from .instagram_archiver import InstagramArchiver
from .archiver import Archiverv2
from .telethon_archiverv2 import TelethonArchiver
from .twitter_archiverv2 import TwitterArchiver
from .twitter_api_archiverv2 import TwitterApiArchiver
from .instagram_archiverv2 import InstagramArchiver
from .tiktok_archiverv2 import TiktokArchiver
from .telegram_archiverv2 import TelegramArchiver
from .vk_archiverv2 import VkArchiver
from .youtubedl_archiverv2 import YoutubeDLArchiver
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver
from .youtubedl_archiver import YoutubeDLArchiver

View file

@@ -8,16 +8,16 @@ from ..core import Step
@dataclass
class Archiverv2(Step):
class Archiver(Step):
name = "archiver"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Archiverv2:
def init(name: str, config: dict) -> Archiver:
# only for typing...
return Step.init(name, config, Archiverv2)
return Step.init(name, config, Archiver)
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
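For orientation after the rename, a minimal custom archiver against the new base class might look like the sketch below. This is only an illustrative sketch: MyArchiver and its step name are hypothetical, the import paths assume the auto_archiver package layout shown in this commit, and the download() signature is an assumption since it is not visible in this hunk.
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata

class MyArchiver(Archiver):
    name = "my_archiver"  # hypothetical step name, referenced from the orchestration config

    def __init__(self, config: dict) -> None:
        # without this Step.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        # no configurable options needed for this sketch
        return {}

    def download(self, item: Metadata) -> Metadata:
        # assumed signature: inspect/enrich the incoming item and return it
        return item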

View file

@@ -1,384 +0,0 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange
from collections import defaultdict
import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify
from ..configs import Config
from storages import Storage
from utils import mkdir_if_not_exists
@dataclass
class ArchiveResult:
status: str
cdn_url: str = None
thumbnail: str = None
thumbnail_index: str = None
duration: float = None
title: str = None
timestamp: datetime.datetime = None
screenshot: str = None
wacz: str = None
hash: str = None
media: list = field(default_factory=list)
class Archiver(ABC):
name = "default"
retry_regex = r"retrying at (\d+)$"
def __init__(self, storage: Storage, config: Config):
self.storage = storage
self.driver = config.webdriver
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config
self.is_docker = config.is_docker
self.media = []
def __str__(self):
return self.__class__.__name__
def __repr__(self):
return self.__str__()
@abstractmethod
def download(self, url, check_if_exists=False): pass
def generateArchiveResult(self, **kwargs):
# remove duplicates
if "cdn_url" in kwargs:
self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
return ArchiveResult(**kwargs)
def get_netloc(self, url):
return urlparse(url).netloc
def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
if key: media_info["key"] = key
if hash: media_info["hash"] = hash
self.media.append(media_info)
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
"""
Generates an index.html page where each @urls_info is displayed
"""
for ui in urls_info:
self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''
for url_info in urls_info:
mime_global = self._guess_file_type(url_info["key"])
preview = ""
if mime_global == "image":
preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
elif mime_global == "video":
preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
page_key = self.get_html_key(url)
page_filename = os.path.join(Storage.TMP_FOLDER, page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
def _guess_file_type(self, path: str):
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
"""
mime = mimetypes.guess_type(path)[0]
if mime is not None:
return mime.split("/")[0]
return ""
def download_from_url(self, url, to_filename):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
d = requests.get(url, headers=headers)
with open(to_filename, 'wb') as f:
f.write(d.content)
def generate_media_page(self, urls, url, object):
"""
For a list of media urls, fetch them, upload them
and call self.generate_media_page_html with them
"""
for media_url in urls:
self.add_to_media(media_url)
thumbnail = None
uploaded_media = []
for media_url in urls:
key = self._get_key_from_url(media_url, ".jpg")
filename = os.path.join(Storage.TMP_FOLDER, key)
self.download_from_url(media_url, filename)
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
thumbnail = cdn_url
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
def get_key(self, filename):
"""
returns a key in the format "[archiverName]_[filename]" includes extension
"""
tail = os.path.split(filename)[1] # returns filename.ext from full path
_id, extension = os.path.splitext(tail) # returns [filename, .ext]
if 'unknown_video' in _id:
_id = _id.replace('unknown_video', 'jpg')
# long filenames can cause problems, so trim them if necessary
if len(_id) > 128:
_id = _id[-128:]
return f'{self.name}_{_id}{extension}'
def get_html_key(self, url):
return self._get_key_from_url(url, ".html")
def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
"""
Receives a URL and returns a slugified version of the URL path
if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
if @append_date is true, the key adds a timestamp after the URL slug and before the extension
"""
url_path = urlparse(url).path
path, ext = os.path.splitext(url_path)
slug = slugify(path)
if append_datetime:
slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
if len(ext):
slug += ext
if with_extension is not None:
if "." not in slug:
slug += with_extension
return self.get_key(slug)
def get_hash(self, filename):
with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes
logger.debug(f'Hash algorithm is {self.hash_algorithm}')
if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
return hash.hexdigest()
def get_screenshot(self, url):
logger.debug(f"getting screenshot for {url=}")
key = self._get_key_from_url(url, ".png", append_datetime=True)
filename = os.path.join(Storage.TMP_FOLDER, key)
# Accept cookies popup dismiss for ytdlp video
if 'facebook.com' in url:
try:
logger.debug(f'Trying fb click accept cookie popup for {url}')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies for url {url}')
try:
self.driver.get(url)
time.sleep(6)
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
cdn_url = self.storage.get_cdn_url(key)
self.add_to_media(cdn_url, key)
return cdn_url
def get_wacz(self, url):
if not self.browsertrix.enabled:
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
return
if self.is_docker:
# TODO: figure out support for browsertrix in docker
# see: https://github.com/bellingcat/auto-archiver/issues/66
logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
return
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.browsertrix.timeout_seconds),
"--timeout", str(self.browsertrix.timeout_seconds)
]
if not os.path.isdir(browsertrix_home):
os.mkdir(browsertrix_home)
if self.browsertrix.profile:
shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
cmd.extend(["--profile", "/crawls/profile.tar.gz"])
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
# do not crash if upload fails
try:
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'application/zip'})
except FileNotFoundError as e:
logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
# clean up the local browsertrix files
try:
shutil.rmtree(browsertrix_home)
except PermissionError:
logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
cdn_url = self.storage.get_cdn_url(key)
self.add_to_media(cdn_url, key)
return cdn_url
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
key_folder = key.split('.')[0] + os.path.sep
mkdir_if_not_exists(thumbnails_folder)
fps = 0.5
if duration is not None:
duration = float(duration)
if duration < 60:
fps = 10.0 / duration
elif duration < 120:
fps = 20.0 / duration
else:
fps = 40.0 / duration
stream = ffmpeg.input(filename)
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
stream.output(thumbnails_folder + 'out%d.jpg').run()
thumbnails = os.listdir(thumbnails_folder)
cdn_urls = []
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = thumbnails_folder + fname
key = os.path.join(key_folder, fname)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:
return ('', '')
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
<body>'''
for t in cdn_urls:
index_page += f'<img src="{t}" />'
index_page += f"</body></html>"
index_fname = thumbnails_folder + 'index.html'
with open(index_fname, 'w') as f:
f.write(index_page)
thumb_index = key_folder + 'index.html'
self.storage.upload(index_fname, thumb_index, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
shutil.rmtree(thumbnails_folder)
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
"""
sets state to retry in random between (min_seconds, max_seconds)
"""
now = datetime.datetime.now().timestamp()
retry_at = int(now + randrange(min_seconds, max_seconds))
logger.debug(f"signaling {retry_at=}")
return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)
def is_retry(status):
return re.search(Archiver.retry_regex, status) is not None
def should_retry_from_status(status):
"""
checks status against message in signal_retry_in
returns true if enough time has elapsed, false otherwise
"""
match = re.search(Archiver.retry_regex, status)
if match:
retry_at = int(match.group(1))
now = datetime.datetime.now().timestamp()
should_retry = now >= retry_at
logger.debug(f"{should_retry=} since {now=} and {retry_at=}")
return should_retry
return False
def remove_retry(status):
"""
transforms the status from retry into something else
"""
new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
logger.debug(f"removing retry message at {status=}, got {new_status=}")
return new_status

View file

@@ -2,11 +2,11 @@ import re, os, shutil, html, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class InstagramArchiver(Archiverv2):
class InstagramArchiver(Archiver):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""

View file

@@ -4,12 +4,12 @@ import html
from bs4 import BeautifulSoup
from loguru import logger
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class TelegramArchiver(Archiverv2):
class TelegramArchiver(Archiver):
"""
Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
"""

View file

@@ -8,12 +8,12 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class TelethonArchiver(Archiverv2):
class TelethonArchiver(Archiver):
name = "telethon_archiver"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

View file

@@ -5,12 +5,12 @@ import uuid
import tiktok_downloader
from loguru import logger
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class TiktokArchiver(Archiverv2):
class TiktokArchiver(Archiver):
name = "tiktok_archiver"
def __init__(self, config: dict) -> None:

View file

@@ -7,13 +7,13 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from . import Archiverv2
from .twitter_archiverv2 import TwitterArchiver
from . import Archiver
from .twitter_archiver import TwitterArchiver
from ..core import Metadata
from ..core import Media
class TwitterApiArchiver(TwitterArchiver, Archiverv2):
class TwitterApiArchiver(TwitterArchiver, Archiver):
name = "twitter_api_archiver"
def __init__(self, config: dict) -> None:

View file

@@ -7,11 +7,11 @@ from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from slugify import slugify
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class TwitterArchiver(Archiverv2):
class TwitterArchiver(Archiver):
"""
This Twitter Archiver uses unofficial scraping methods.
"""

View file

@@ -2,12 +2,12 @@ from loguru import logger
from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class VkArchiver(Archiverv2):
class VkArchiver(Archiver):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts

View file

@@ -4,12 +4,12 @@ import os
import yt_dlp
from loguru import logger
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media
class YoutubeDLArchiver(Archiverv2):
class YoutubeDLArchiver(Archiver):
name = "youtubedl_enricher"
def __init__(self, config: dict) -> None:

View file

@@ -1,176 +0,0 @@
import os, datetime, traceback, random, tempfile
from loguru import logger
from slugify import slugify
from urllib.parse import quote
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver
from utils import GWorksheet, expand_url
from configs import Config
from storages import Storage
random.seed()
def update_sheet(gw, row, url, result: ArchiveResult):
cell_updates = []
row_values = gw.get_row(row)
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
cell_updates.append((row, col, final_value))
cell_updates.append((row, 'status', result.status))
batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration))
batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash)
if result.wacz is not None:
batch_if_valid('wacz', result.wacz)
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
if result.timestamp is not None:
if type(result.timestamp) == int:
timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
elif type(result.timestamp) == str:
timestamp_string = result.timestamp
else:
timestamp_string = result.timestamp.isoformat()
batch_if_valid('timestamp', timestamp_string)
gw.batch_set_cell(cell_updates)
def missing_required_columns(gw: GWorksheet):
missing = False
for required_col in ['url', 'status']:
if not gw.col_exists(required_col):
logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
missing = True
return missing
def should_process_sheet(c: Config, sheet_name):
if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
# ALLOW rules exist AND sheet name not explicitly allowed
return False
if len(c.worksheet_block) and sheet_name in c.worksheet_block:
# BLOCK rules exist AND sheet name is blocked
return False
return True
def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
url = expand_url(url)
c.set_folder(folder)
storage = c.get_storage()
# make a new driver so each spreadsheet row is idempotent
c.recreate_webdriver()
# order matters, first to succeed excludes remaining
active_archivers = [
TelethonArchiver(storage, c),
TiktokArchiver(storage, c),
TwitterApiArchiver(storage, c),
InstagramArchiver(storage, c),
YoutubeDLArchiver(storage, c),
TelegramArchiver(storage, c),
TwitterArchiver(storage, c),
VkArchiver(storage, c),
WaybackArchiver(storage, c)
]
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on {debug_string}')
try:
result = archiver.download(url, check_if_exists=c.check_if_exists)
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except Exception as e:
result = False
logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
if result:
success = result.status in ['success', 'already archived']
result.status = f"{archiver.name}: {result.status}"
if success:
logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
break
# only 1 retry possible for now
if is_retry and Archiver.is_retry(result.status):
result.status = Archiver.remove_retry(result.status)
logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
return result
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
if not should_process_sheet(c, wks.title):
logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
if missing_required_columns(gw): continue
# archives will default to being in a folder 'doc_name/worksheet_name'
default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
c.set_folder(default_folder)
storage = c.get_storage()
# loop through rows in worksheet
for row in range(1 + c.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
is_retry = False
if url == '' or status not in ['', None]:
is_retry = Archiver.should_retry_from_status(status)
if not is_retry: continue
# All checks done - archival process starts here
try:
gw.set_cell(row, 'status', 'Archive in progress')
result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
if result:
update_sheet(gw, row, url, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {row=}, {url=}")
gw.set_cell(row, 'status', '')
c.destroy_webdriver()
exit()
except Exception as e:
logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
logger.success(f'Finished worksheet {wks.title}')
@logger.catch
def main():
c = Config()
c.parse()
logger.info(f'Opening document {c.sheet} for header {c.header}')
with tempfile.TemporaryDirectory(dir="./") as tmpdir:
Storage.TMP_FOLDER = tmpdir
process_sheet(c)
c.destroy_webdriver()
if __name__ == '__main__':
main()

View file

@@ -1,29 +1,34 @@
import tempfile
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
#TODO: refactor GDriveStorage before merging to main
# is it possible to have something like this with the new pipeline?
def main():
c = Config()
c.parse()
logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
gc = c.gsheets_client
sh = gc.open(c.sheet)
wks = sh.get_worksheet(0)
values = wks.get_all_values()
with tempfile.TemporaryDirectory(dir="./") as tmpdir:
Storage.TMP_FOLDER = tmpdir
for i in range(11, len(values)):
c.sheet = values[i][0]
logger.info(f"Processing {c.sheet}")
auto_archive.process_sheet(c)
c.destroy_webdriver()
# # import tempfile
# import auto_archive
# from loguru import logger
# from configs import Config
# from storages import Storage
if __name__ == "__main__":
main()
# def main():
# c = Config()
# c.parse()
# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
# gc = c.gsheets_client
# sh = gc.open(c.sheet)
# wks = sh.get_worksheet(0)
# values = wks.get_all_values()
# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
# Storage.TMP_FOLDER = tmpdir
# for i in range(11, len(values)):
# c.sheet = values[i][0]
# logger.info(f"Processing {c.sheet}")
# auto_archive.process_sheet(c)
# c.destroy_webdriver()
# if __name__ == "__main__":
# main()

View file

@@ -4,4 +4,4 @@ from .step import Step
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
# from .v2config import ConfigV2
# from .config import Config

View file

@@ -5,31 +5,31 @@ from dataclasses import dataclass, field
from typing import List
from collections import defaultdict
from ..archivers import Archiverv2
from ..archivers import Archiver
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import StorageV2
from ..storages import Storage
from . import Step
from ..enrichers import Enricher
@dataclass
class ConfigV2:
class Config:
# TODO: should Config inherit from Step so it can have it's own configurations?
# these are only detected if they are put to the respective __init__.py
configurable_parents = [
Feeder,
Enricher,
Archiverv2,
Archiver,
Database,
StorageV2,
Storage,
Formatter
# Util
]
feeder: Step # TODO:= BaseFeeder
formatter: Formatter
archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type
archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
enrichers: List[Enricher] = field(default_factory=[])
storages: List[Step] = field(default_factory=[]) # TODO: fix type
databases: List[Database] = field(default_factory=[])
@@ -107,9 +107,9 @@ class ConfigV2:
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])]
self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]
self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])

View file

@@ -4,7 +4,7 @@ from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import datetime, mimetypes
import datetime
from urllib.parse import urlparse
from loguru import logger
from dateutil.parser import parse as parse_dt
@@ -17,7 +17,7 @@ class Metadata:
status: str = "no archiver"
_processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set, repr=False) # keys that are not to be saved in DBs
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False

View file

@@ -3,10 +3,10 @@ from ast import List
from typing import Union, Dict
from dataclasses import dataclass
from ..archivers import Archiverv2
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import StorageV2
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .media import Media
@@ -59,9 +59,9 @@ class ArchivingOrchestrator:
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[StorageV2] = config.storages
self.storages: List[Storage] = config.storages
for a in self.archivers: a.setup()
@@ -69,12 +69,12 @@ class ArchivingOrchestrator:
for item in self.feeder:
self.feed_item(item)
def feed_item(self, item:Metadata) -> Metadata:
def feed_item(self, item: Metadata) -> Metadata:
print("ARCHIVING", item)
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
result = self.archive(item)
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
@ -84,8 +84,6 @@ class ArchivingOrchestrator:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases: d.failed(item)
return result
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
@ -154,7 +152,7 @@ class ArchivingOrchestrator:
for prop in m.properties.values():
if isinstance(prop, Media):
s.store(prop, result)
if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media):
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
for prop_media in prop:
s.store(prop_media, result)

View file

@@ -28,7 +28,7 @@ class Step(ABC):
for sub in child.__subclasses__():
if sub.name == name:
return sub(config)
raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.")
raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")
def assert_valid_string(self, prop: str) -> None:
"""

View file

@@ -1,3 +1,4 @@
from .database import Database
from .gsheet_db import GsheetsDb
from .console_db import ConsoleDb
from .console_db import ConsoleDb
from .csv_db import CSVDb

View file

@@ -0,0 +1,33 @@
import os
from loguru import logger
from . import Database
from ..core import Metadata
from csv import DictWriter
from dataclasses import asdict
class CSVDb(Database):
"""
Outputs results to a CSV file
"""
name = "csv_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.assert_valid_string("csv_file")
@staticmethod
def configs() -> dict:
return {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
}
def done(self, item: Metadata) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item}")
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
with open(self.csv_file, "a", encoding="utf-8") as outf:
writer = DictWriter(outf, fieldnames=asdict(Metadata()))
if is_empty: writer.writeheader()
writer.writerow(asdict(item))
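The append logic above can be sanity-checked in isolation: DictWriter uses the dataclass's keys as columns (iterating asdict() of a default instance yields the field names), and the header is only written when the file is new or empty, so repeated runs keep appending under a single header. A minimal standalone sketch with a hypothetical Record dataclass standing in for Metadata:
import os
from csv import DictWriter
from dataclasses import dataclass, asdict, field

@dataclass
class Record:                        # stand-in for Metadata in this illustration
    status: str = "no archiver"
    metadata: dict = field(default_factory=dict)

csv_file = "db.csv"                  # mirrors the csv_file default above
item = Record(status="success", metadata={"url": "https://example.com"})

is_empty = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
with open(csv_file, "a", encoding="utf-8") as outf:
    writer = DictWriter(outf, fieldnames=asdict(Record()))  # dict keys become the columns
    if is_empty:
        writer.writeheader()
    writer.writerow(asdict(item))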

View file

@@ -18,7 +18,6 @@ class Database(Step, ABC):
# only for typing...
return Step.init(name, config, Database)
@abstractmethod
def started(self, item: Metadata) -> None:
"""signals the DB that the given item archival has started"""
pass

View file

@@ -2,10 +2,10 @@ from loguru import logger
import time, requests
from . import Enricher
from ..archivers import Archiverv2
from ..archivers import Archiver
from ..core import Metadata
class WaybackArchiverEnricher(Enricher, Archiverv2):
class WaybackArchiverEnricher(Enricher, Archiver):
"""
Submits the current URL to the webarchive and returns a job_id or completed archive
"""

View file

@@ -1,2 +1,3 @@
from.feeder import Feeder
from .gsheet_feeder import GsheetsFeeder
from .gsheet_feeder import GsheetsFeeder
from .cli_feeder import CLIFeeder

View file

@@ -0,0 +1,33 @@
import gspread, os
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from . import Feeder
from ..core import Metadata
class CLIFeeder(Feeder):
name = "cli_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert type(self.urls) == list and len(self.urls) > 0, "Please provide a CSV list of URL(s) to process, with --cli_feeder.urls='url1,url2,url3'"
@staticmethod
def configs() -> dict:
return {
"urls": {
"default": None,
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
},
}
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing {url}")
yield Metadata().set_url(url).set("folder", "cli", True)
logger.success(f"Processed {len(self.urls)} URL(s)")
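Per the assert message above, the URLs arrive as one comma-separated command-line value (e.g. --cli_feeder.urls='url1,url2,url3'); the cli_set hook splits and deduplicates them, and __iter__ then yields one Metadata per URL. A small sketch of that value handling (the invocation shape is taken from the assert message, not verified here):
# hypothetical CLI call: python -m auto_archiver --cli_feeder.urls="https://a.example,https://b.example"
cli_val = "https://a.example,https://b.example,https://a.example"

urls = list(set(cli_val.split(",")))   # same transform as the "cli_set" lambda: split, then dedupe
assert type(urls) == list and len(urls) > 0

for url in urls:
    # __iter__ above would yield Metadata().set_url(url).set("folder", "cli", True)
    print("feeding", url)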

View file

@@ -1,9 +1,3 @@
# we need to explicitly expose the available imports here
from .base_storage import Storage
# from .local_storage import LocalStorage, LocalConfig
# from .s3_storage import S3Config, S3Storage
# from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2
from .storage import Storage
from .s3 import S3Storage
from .local import LocalStorage

View file

@@ -1,33 +0,0 @@
import os, uuid
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path
class Storage(ABC):
TMP_FOLDER = "tmp/"
@abstractmethod
def __init__(self, config): pass
@abstractmethod
def get_cdn_url(self, key): pass
@abstractmethod
def exists(self, key): pass
@abstractmethod
def uploadf(self, file, key, **kwargs): pass
def clean_key(self, key):
# Some storages does not work well with trailing forward slashes and some keys come with that
if key.startswith('/'):
logger.debug(f'Found and fixed a leading "/" for {key=}')
return key[1:]
return key
def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)

View file

@@ -1,178 +1,181 @@
import os, time
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
#TODO: refactor GDriveStorage before merging to main
# import os, time
# from loguru import logger
# from .base_storage import Storage
# from dataclasses import dataclass
# from googleapiclient.discovery import build
# from googleapiclient.http import MediaFileUpload
# from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
# from google.oauth2.credentials import Credentials
# from google.auth.transport.requests import Request
@dataclass
class GDConfig:
root_folder_id: str
oauth_token_filename: str
service_account: str = "service_account.json"
folder: str = "default"
# @dataclass
# class GDConfig:
# root_folder_id: str
# oauth_token_filename: str
# service_account: str = "service_account.json"
# folder: str = "default"
class GDStorage(Storage):
def __init__(self, config: GDConfig):
self.folder = config.folder
self.root_folder_id = config.root_folder_id
# class GDStorage(Storage):
# def __init__(self, config: GDConfig):
# self.folder = config.folder
# self.root_folder_id = config.root_folder_id
SCOPES=['https://www.googleapis.com/auth/drive']
# SCOPES=['https://www.googleapis.com/auth/drive']
token_file = config.oauth_token_filename
if token_file is not None:
"""
Tokens are refreshed after 1 hour
however keep working for 7 days (tbc)
so as long as the job doesn't last for 7 days
then this method of refreshing only once per run will work
see this link for details on the token
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
logger.debug(f'Using GD OAuth token {token_file}')
creds = Credentials.from_authorized_user_file(token_file, SCOPES)
# token_file = config.oauth_token_filename
# if token_file is not None:
# """
# Tokens are refreshed after 1 hour
# however keep working for 7 days (tbc)
# so as long as the job doesn't last for 7 days
# then this method of refreshing only once per run will work
# see this link for details on the token
# https://davemateer.com/2022/04/28/google-drive-with-python#tokens
# """
# logger.debug(f'Using GD OAuth token {token_file}')
# creds = Credentials.from_authorized_user_file(token_file, SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
logger.debug('Requesting new GD OAuth token')
creds.refresh(Request())
else:
raise Exception("Problem with creds - create the token again")
# if not creds or not creds.valid:
# if creds and creds.expired and creds.refresh_token:
# logger.debug('Requesting new GD OAuth token')
# creds.refresh(Request())
# else:
# raise Exception("Problem with creds - create the token again")
# Save the credentials for the next run
with open(token_file, 'w') as token:
logger.debug('Saving new GD OAuth token')
token.write(creds.to_json())
else:
logger.debug('GD OAuth Token valid')
else:
gd_service_account = config.service_account
logger.debug(f'Using GD Service Account {gd_service_account}')
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
# # Save the credentials for the next run
# with open(token_file, 'w') as token:
# logger.debug('Saving new GD OAuth token')
# token.write(creds.to_json())
# else:
# logger.debug('GD OAuth Token valid')
# else:
# gd_service_account = config.service_account
# logger.debug(f'Using GD Service Account {gd_service_account}')
# creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
self.service = build('drive', 'v3', credentials=creds)
# self.service = build('drive', 'v3', credentials=creds)
def get_cdn_url(self, key):
"""
only support files saved in a folder for GD
S3 supports folder and all stored in the root
"""
key = self.clean_key(key)
# def get_cdn_url(self, key):
# """
# only support files saved in a folder for GD
# S3 supports folder and all stored in the root
# """
# key = self.clean_key(key)
full_name = os.path.join(self.folder, key)
parent_id, folder_id = self.root_folder_id, None
path_parts = full_name.split(os.path.sep)
filename = path_parts[-1]
logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id
# full_name = os.path.join(self.folder, key)
# parent_id, folder_id = self.root_folder_id, None
# path_parts = full_name.split(os.path.sep)
# filename = path_parts[-1]
# logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
# for folder in path_parts[0:-1]:
# folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
# parent_id = folder_id
# get id of file inside folder (or sub folder)
file_id = self._get_id_from_parent_and_name(folder_id, filename)
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
# # get id of file inside folder (or sub folder)
# file_id = self._get_id_from_parent_and_name(folder_id, filename)
# return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def exists(self, key):
try:
self.get_cdn_url(key)
return True
except: return False
# def exists(self, key):
# try:
# self.get_cdn_url(key)
# return True
# except: return False
def uploadf(self, file: str, key: str, **_kwargs):
"""
1. for each sub-folder in the path check if exists or create
2. upload file to root_id/other_paths.../filename
"""
key = self.clean_key(key)
# def uploadf(self, file: str, key: str, **_kwargs):
# """
# 1. for each sub-folder in the path check if exists or create
# 2. upload file to root_id/other_paths.../filename
# """
# key = self.clean_key(key)
full_name = os.path.join(self.folder, key)
parent_id, upload_to = self.root_folder_id, None
path_parts = full_name.split(os.path.sep)
filename = path_parts[-1]
logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
for folder in path_parts[0:-1]:
upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
if upload_to is None:
upload_to = self._mkdir(folder, parent_id)
parent_id = upload_to
# full_name = os.path.join(self.folder, key)
# parent_id, upload_to = self.root_folder_id, None
# path_parts = full_name.split(os.path.sep)
# filename = path_parts[-1]
# logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
# for folder in path_parts[0:-1]:
# upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
# if upload_to is None:
# upload_to = self._mkdir(folder, parent_id)
# parent_id = upload_to
# upload file to gd
logger.debug(f'uploading {filename=} to folder id {upload_to}')
file_metadata = {
'name': [filename],
'parents': [upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
# # upload file to gd
# logger.debug(f'uploading {filename=} to folder id {upload_to}')
# file_metadata = {
# 'name': [filename],
# 'parents': [upload_to]
# }
# media = MediaFileUpload(file, resumable=True)
# gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
# logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
def upload(self, filename: str, key: str, **kwargs):
# GD only requires the filename not a file reader
self.uploadf(filename, key, **kwargs)
# def upload(self, filename: str, key: str, **kwargs):
# # GD only requires the filename not a file reader
# self.uploadf(filename, key, **kwargs)
# gets the Drive folderID if it is there
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
"""
Retrieves the id of a folder or file from its @name and the @parent_id folder
Optionally does multiple @retries and sleeps @sleep_seconds between them
If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
If @raise_on_missing will throw error when not found, or returns None
Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
Returns the id of the file or folder from its name as a string
"""
# cache logic
if use_cache:
self.api_cache = getattr(self, "api_cache", {})
cache_key = f"{parent_id}_{name}_{use_mime_type}"
if cache_key in self.api_cache:
logger.debug(f"cache hit for {cache_key=}")
return self.api_cache[cache_key]
# # gets the Drive folderID if it is there
# def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
# """
# Retrieves the id of a folder or file from its @name and the @parent_id folder
# Optionally does multiple @retries and sleeps @sleep_seconds between them
# If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
# If @raise_on_missing will throw error when not found, or returns None
# Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
# Returns the id of the file or folder from its name as a string
# """
# # cache logic
# if use_cache:
# self.api_cache = getattr(self, "api_cache", {})
# cache_key = f"{parent_id}_{name}_{use_mime_type}"
# if cache_key in self.api_cache:
# logger.debug(f"cache hit for {cache_key=}")
# return self.api_cache[cache_key]
# API logic
debug_header: str = f"[searching {name=} in {parent_id=}]"
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
if use_mime_type:
query_string += f" and mimeType='application/vnd.google-apps.folder' "
# # API logic
# debug_header: str = f"[searching {name=} in {parent_id=}]"
# query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
# if use_mime_type:
# query_string += f" and mimeType='application/vnd.google-apps.folder' "
for attempt in range(retries):
results = self.service.files().list(
q=query_string,
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
# for attempt in range(retries):
# results = self.service.files().list(
# q=query_string,
# spaces='drive', # ie not appDataFolder or photos
# fields='files(id, name)'
# ).execute()
# items = results.get('files', [])
if len(items) > 0:
logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
_id = items[-1]['id']
if use_cache: self.api_cache[cache_key] = _id
return _id
else:
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
if attempt < retries - 1:
logger.debug(f'sleeping for {sleep_seconds} second(s)')
time.sleep(sleep_seconds)
# if len(items) > 0:
# logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
# _id = items[-1]['id']
# if use_cache: self.api_cache[cache_key] = _id
# return _id
# else:
# logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
# if attempt < retries - 1:
# logger.debug(f'sleeping for {sleep_seconds} second(s)')
# time.sleep(sleep_seconds)
if raise_on_missing:
raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
return None
# if raise_on_missing:
# raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
# return None
def _mkdir(self, name: str, parent_id: str):
"""
Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder
"""
logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
file_metadata = {
'name': [name],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [parent_id]
}
gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
return gd_folder.get('id')
# def _mkdir(self, name: str, parent_id: str):
# """
# Creates a new GDrive folder @name inside folder @parent_id
# Returns id of the created folder
# """
# logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
# file_metadata = {
# 'name': [name],
# 'mimeType': 'application/vnd.google-apps.folder',
# 'parents': [parent_id]
# }
# gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
# return gd_folder.get('id')

View file

@@ -8,10 +8,10 @@ from slugify import slugify
from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
from ..storages import Storage
class LocalStorageV2(StorageV2):
class LocalStorage(Storage):
name = "local_storage"
def __init__(self, config: dict) -> None:

View file

@@ -4,12 +4,12 @@ import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
from ..storages import Storage
from loguru import logger
from slugify import slugify
class S3StorageV2(StorageV2):
class S3Storage(Storage):
name = "s3_storage"
def __init__(self, config: dict) -> None:

View file

@@ -1,80 +0,0 @@
import uuid, os, mimetypes
from dataclasses import dataclass
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger
@dataclass
class S3Config:
bucket: str
region: str
key: str
secret: str
folder: str = ""
endpoint_url: str = "https://{region}.digitaloceanspaces.com"
cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
private: bool = False
key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid
class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = config.folder
self.private = config.private
self.cdn_url = config.cdn_url
self.key_path = config.key_path
self.key_dict = {}
self.s3 = boto3.client(
's3',
region_name=config.region,
endpoint_url=config.endpoint_url.format(region=config.region),
aws_access_key_id=config.key,
aws_secret_access_key=config.secret
)
def _get_path(self, key):
"""
Depends on the self.key_path configuration:
* random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
* default -> defaults to self.folder/key
"""
# defaults to /key
final_key = key
if self.key_path == "random":
if key not in self.key_dict:
ext = os.path.splitext(key)[1]
self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
final_key = self.key_dict[key]
return os.path.join(self.folder, final_key)
def get_cdn_url(self, key):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
def exists(self, key):
try:
self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
return True
except ClientError:
return False
def uploadf(self, file, key, **kwargs):
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args:
try:
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
except Exception as e:
logger.error(f"Unable to get mimetype for {key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View file

@@ -10,7 +10,7 @@ from slugify import slugify
@dataclass
class StorageV2(Step):
class Storage(Step):
name = "storage"
def __init__(self, config: dict) -> None:
@@ -18,8 +18,8 @@ class StorageV2(Step):
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> StorageV2:
return Step.init(name, config, StorageV2)
def init(name: str, config: dict) -> Storage:
return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None:
self.set_key(media, item)