mirrored from https://github.com/bellingcat/auto-archiver
final naming cleanup + new feeders/dbs
parent
753039240f
commit
b763fc4188
@@ -26,4 +26,5 @@ instaloader/*
instaloader.session
orchestration.yaml
auto_archiver.egg-info*
logs*
logs*
*.csv
@@ -2,6 +2,6 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
# need to manually specify due to cyclical deps
from .core.orchestrator import ArchivingOrchestrator
from .core.v2config import ConfigV2
from .core.config import Config
# making accessible directly
from .core.metadata import Metadata
@@ -1,8 +1,8 @@
from . import ConfigV2
from . import Config
from . import ArchivingOrchestrator

def main():
    config = ConfigV2()
    config = Config()
    config.parse()
    orchestrator = ArchivingOrchestrator(config)
    orchestrator.feed()
@@ -10,12 +10,12 @@
# from .twitter_api_archiver import TwitterApiArchiver
# from .instagram_archiver import InstagramArchiver

from .archiver import Archiverv2
from .telethon_archiverv2 import TelethonArchiver
from .twitter_archiverv2 import TwitterArchiver
from .twitter_api_archiverv2 import TwitterApiArchiver
from .instagram_archiverv2 import InstagramArchiver
from .tiktok_archiverv2 import TiktokArchiver
from .telegram_archiverv2 import TelegramArchiver
from .vk_archiverv2 import VkArchiver
from .youtubedl_archiverv2 import YoutubeDLArchiver
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver
from .youtubedl_archiver import YoutubeDLArchiver
@@ -8,16 +8,16 @@ from ..core import Step


@dataclass
class Archiverv2(Step):
class Archiver(Step):
    name = "archiver"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    def init(name: str, config: dict) -> Archiverv2:
    def init(name: str, config: dict) -> Archiver:
        # only for typing...
        return Step.init(name, config, Archiverv2)
        return Step.init(name, config, Archiver)

    def setup(self) -> None:
        # used when archivers need to login or do other one-time setup
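For orientation (not part of the diff): with the v2 suffix dropped, a new archiver is just another Step subclass — it sets a unique name, calls super().__init__(config), and must be exposed from the archivers package __init__.py so the name-based lookup can find it. A minimal, hypothetical sketch following the pattern shown above:

# hypothetical module inside the archivers package; class and name are illustrative
from . import Archiver

class ExampleArchiver(Archiver):
    name = "example_archiver"  # the identifier referenced in the orchestration config

    def __init__(self, config: dict) -> None:
        # without this Step.__init__ is not called
        super().__init__(config)

    def setup(self) -> None:
        # one-time login or other setup, as described above
        pass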
@@ -1,384 +0,0 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange
from collections import defaultdict

import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify

from ..configs import Config
from storages import Storage
from utils import mkdir_if_not_exists


@dataclass
class ArchiveResult:
    status: str
    cdn_url: str = None
    thumbnail: str = None
    thumbnail_index: str = None
    duration: float = None
    title: str = None
    timestamp: datetime.datetime = None
    screenshot: str = None
    wacz: str = None
    hash: str = None
    media: list = field(default_factory=list)

class Archiver(ABC):
    name = "default"
    retry_regex = r"retrying at (\d+)$"

    def __init__(self, storage: Storage, config: Config):
        self.storage = storage
        self.driver = config.webdriver
        self.hash_algorithm = config.hash_algorithm
        self.browsertrix = config.browsertrix_config
        self.is_docker = config.is_docker
        self.media = []

    def __str__(self):
        return self.__class__.__name__

    def __repr__(self):
        return self.__str__()

    @abstractmethod
    def download(self, url, check_if_exists=False): pass

    def generateArchiveResult(self, **kwargs):
        # remove duplicates
        if "cdn_url" in kwargs:
            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
        return ArchiveResult(**kwargs)

    def get_netloc(self, url):
        return urlparse(url).netloc

    def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
        media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
        if key: media_info["key"] = key
        if hash: media_info["hash"] = hash
        self.media.append(media_info)

    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
        """
        Generates an index.html page where each @urls_info is displayed
        """
        for ui in urls_info:
            self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''

        for url_info in urls_info:
            mime_global = self._guess_file_type(url_info["key"])
            preview = ""
            if mime_global == "image":
                preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
            elif mime_global == "video":
                preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
            page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''

        page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
        page += f"</body></html>"

        page_key = self.get_html_key(url)
        page_filename = os.path.join(Storage.TMP_FOLDER, page_key)

        with open(page_filename, "w") as f:
            f.write(page)

        page_hash = self.get_hash(page_filename)

        self.storage.upload(page_filename, page_key, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})

        page_cdn = self.storage.get_cdn_url(page_key)
        return (page_cdn, page_hash, thumbnail)

    def _guess_file_type(self, path: str):
        """
        Receives a URL or filename and returns global mimetype like 'image' or 'video'
        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            return mime.split("/")[0]
        return ""

    def download_from_url(self, url, to_filename):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        d = requests.get(url, headers=headers)
        with open(to_filename, 'wb') as f:
            f.write(d.content)

    def generate_media_page(self, urls, url, object):
        """
        For a list of media urls, fetch them, upload them
        and call self.generate_media_page_html with them
        """
        for media_url in urls:
            self.add_to_media(media_url)

        thumbnail = None
        uploaded_media = []
        for media_url in urls:
            key = self._get_key_from_url(media_url, ".jpg")

            filename = os.path.join(Storage.TMP_FOLDER, key)
            self.download_from_url(media_url, filename)
            self.storage.upload(filename, key)
            hash = self.get_hash(filename)
            cdn_url = self.storage.get_cdn_url(key)

            if thumbnail is None:
                thumbnail = cdn_url
            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})

        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)

    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]" includes extension
        """
        tail = os.path.split(filename)[1]  # returns filename.ext from full path
        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')

        # long filenames can cause problems, so trim them if necessary
        if len(_id) > 128:
            _id = _id[-128:]

        return f'{self.name}_{_id}{extension}'

    def get_html_key(self, url):
        return self._get_key_from_url(url, ".html")

    def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
        """
        Receives a URL and returns a slugified version of the URL path
        if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
        if @append_date is true, the key adds a timestamp after the URL slug and before the extension
        """
        url_path = urlparse(url).path
        path, ext = os.path.splitext(url_path)
        slug = slugify(path)
        if append_datetime:
            slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
        if len(ext):
            slug += ext
        if with_extension is not None:
            if "." not in slug:
                slug += with_extension
        return self.get_key(slug)

    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
        logger.debug(f'Hash algorithm is {self.hash_algorithm}')

        if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
        elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
        else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")

        return hash.hexdigest()

    def get_screenshot(self, url):
        logger.debug(f"getting screenshot for {url=}")
        key = self._get_key_from_url(url, ".png", append_datetime=True)
        filename = os.path.join(Storage.TMP_FOLDER, key)

        # Accept cookies popup dismiss for ytdlp video
        if 'facebook.com' in url:
            try:
                logger.debug(f'Trying fb click accept cookie popup for {url}')
                self.driver.get("http://www.facebook.com")
                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                foo.click()
                logger.debug(f'fb click worked')
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except:
                logger.warning(f'Failed on fb accept cookies for url {url}')

        try:
            self.driver.get(url)
            time.sleep(6)
        except TimeoutException:
            logger.info("TimeoutException loading page for screenshot")

        self.driver.save_screenshot(filename)
        self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

        cdn_url = self.storage.get_cdn_url(key)
        self.add_to_media(cdn_url, key)

        return cdn_url

    def get_wacz(self, url):
        if not self.browsertrix.enabled:
            logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
            return
        if self.is_docker:
            # TODO: figure out support for browsertrix in docker
            # see: https://github.com/bellingcat/auto-archiver/issues/66
            logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
            return

        logger.debug(f"getting wacz for {url}")
        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))

        browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
        cmd = [
            "docker", "run",
            "--rm",  # delete container once it has completed running
            "-v", f"{browsertrix_home}:/crawls/",
            # "-it", # this leads to "the input device is not a TTY"
            "webrecorder/browsertrix-crawler", "crawl",
            "--url", url,
            "--scopeType", "page",
            "--generateWACZ",
            "--text",
            "--collection", collection,
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", str(self.browsertrix.timeout_seconds),
            "--timeout", str(self.browsertrix.timeout_seconds)
        ]

        if not os.path.isdir(browsertrix_home):
            os.mkdir(browsertrix_home)

        if self.browsertrix.profile:
            shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
            cmd.extend(["--profile", "/crawls/profile.tar.gz"])

        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
        except Exception as e:
            logger.error(f"WACZ generation failed: {e}")
            return

        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")

        # do not crash if upload fails
        try:
            self.storage.upload(filename, key, extra_args={
                'ACL': 'public-read', 'ContentType': 'application/zip'})
        except FileNotFoundError as e:
            logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")

        # clean up the local browsertrix files
        try:
            shutil.rmtree(browsertrix_home)
        except PermissionError:
            logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")

        cdn_url = self.storage.get_cdn_url(key)
        self.add_to_media(cdn_url, key)
        return cdn_url

    def get_thumbnails(self, filename, key, duration=None):
        thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
        key_folder = key.split('.')[0] + os.path.sep

        mkdir_if_not_exists(thumbnails_folder)

        fps = 0.5
        if duration is not None:
            duration = float(duration)

            if duration < 60:
                fps = 10.0 / duration
            elif duration < 120:
                fps = 20.0 / duration
            else:
                fps = 40.0 / duration

        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
        stream.output(thumbnails_folder + 'out%d.jpg').run()

        thumbnails = os.listdir(thumbnails_folder)
        cdn_urls = []
        for fname in thumbnails:
            if fname[-3:] == 'jpg':
                thumbnail_filename = thumbnails_folder + fname
                key = os.path.join(key_folder, fname)

                self.storage.upload(thumbnail_filename, key)
                cdn_url = self.storage.get_cdn_url(key)
                cdn_urls.append(cdn_url)

        if len(cdn_urls) == 0:
            return ('', '')

        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

        index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
<body>'''

        for t in cdn_urls:
            index_page += f'<img src="{t}" />'

        index_page += f"</body></html>"
        index_fname = thumbnails_folder + 'index.html'

        with open(index_fname, 'w') as f:
            f.write(index_page)

        thumb_index = key_folder + 'index.html'

        self.storage.upload(index_fname, thumb_index, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})
        shutil.rmtree(thumbnails_folder)

        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

        return (key_thumb, thumb_index_cdn_url)

    def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
        """
        sets state to retry in random between (min_seconds, max_seconds)
        """
        now = datetime.datetime.now().timestamp()
        retry_at = int(now + randrange(min_seconds, max_seconds))
        logger.debug(f"signaling {retry_at=}")
        return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)

    def is_retry(status):
        return re.search(Archiver.retry_regex, status) is not None

    def should_retry_from_status(status):
        """
        checks status against message in signal_retry_in
        returns true if enough time has elapsed, false otherwise
        """
        match = re.search(Archiver.retry_regex, status)
        if match:
            retry_at = int(match.group(1))
            now = datetime.datetime.now().timestamp()
            should_retry = now >= retry_at
            logger.debug(f"{should_retry=} since {now=} and {retry_at=}")
            return should_retry
        return False

    def remove_retry(status):
        """
        transforms the status from retry into something else
        """
        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
        logger.debug(f"removing retry message at {status=}, got {new_status=}")
        return new_status
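Aside (not in the diff): the retry logic of this deleted base class keeps its state inside the status string, e.g. "retrying at <epoch>"; the timestamp below is made up for illustration.

status = "retrying at 1700003600"          # as produced by signal_retry_in()
Archiver.is_retry(status)                   # True: matches retry_regex
Archiver.should_retry_from_status(status)   # True only once now >= 1700003600
Archiver.remove_retry(status)               # "failed: too many retries"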
@@ -2,11 +2,11 @@ import re, os, shutil, html, traceback
import instaloader  # https://instaloader.github.io/as-module.html
from loguru import logger

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media

class InstagramArchiver(Archiverv2):
class InstagramArchiver(Archiver):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
@@ -4,12 +4,12 @@ import html
from bs4 import BeautifulSoup
from loguru import logger

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media


class TelegramArchiver(Archiverv2):
class TelegramArchiver(Archiver):
    """
    Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
    """
@@ -8,12 +8,12 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media


class TelethonArchiver(Archiverv2):
class TelethonArchiver(Archiver):
    name = "telethon_archiver"
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
    invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
@@ -5,12 +5,12 @@ import uuid
import tiktok_downloader
from loguru import logger

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media


class TiktokArchiver(Archiverv2):
class TiktokArchiver(Archiver):
    name = "tiktok_archiver"

    def __init__(self, config: dict) -> None:
@@ -7,13 +7,13 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify

from . import Archiverv2
from .twitter_archiverv2 import TwitterArchiver
from . import Archiver
from .twitter_archiver import TwitterArchiver
from ..core import Metadata
from ..core import Media


class TwitterApiArchiver(TwitterArchiver, Archiverv2):
class TwitterApiArchiver(TwitterArchiver, Archiver):
    name = "twitter_api_archiver"

    def __init__(self, config: dict) -> None:
@@ -7,11 +7,11 @@ from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from slugify import slugify

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media

class TwitterArchiver(Archiverv2):
class TwitterArchiver(Archiver):
    """
    This Twitter Archiver uses unofficial scraping methods.
    """
@@ -2,12 +2,12 @@ from loguru import logger
from vk_url_scraper import VkScraper

from ..utils.misc import dump_payload
from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media


class VkArchiver(Archiverv2):
class VkArchiver(Archiver):
    """"
    VK videos are handled by YTDownloader, this archiver gets posts text and images.
    Currently only works for /wall posts
@@ -4,12 +4,12 @@ import os
import yt_dlp
from loguru import logger

from . import Archiverv2
from . import Archiver
from ..core import Metadata
from ..core import Media


class YoutubeDLArchiver(Archiverv2):
class YoutubeDLArchiver(Archiver):
    name = "youtubedl_enricher"

    def __init__(self, config: dict) -> None:
@@ -1,176 +0,0 @@
import os, datetime, traceback, random, tempfile

from loguru import logger
from slugify import slugify
from urllib.parse import quote

from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver
from utils import GWorksheet, expand_url
from configs import Config
from storages import Storage

random.seed()


def update_sheet(gw, row, url, result: ArchiveResult):
    cell_updates = []
    row_values = gw.get_row(row)

    def batch_if_valid(col, val, final_value=None):
        final_value = final_value or val
        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
            cell_updates.append((row, col, final_value))

    cell_updates.append((row, 'status', result.status))

    batch_if_valid('archive', result.cdn_url)
    batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))
    batch_if_valid('screenshot', result.screenshot)
    batch_if_valid('hash', result.hash)
    if result.wacz is not None:
        batch_if_valid('wacz', result.wacz)
        batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')

    if result.timestamp is not None:
        if type(result.timestamp) == int:
            timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
        elif type(result.timestamp) == str:
            timestamp_string = result.timestamp
        else:
            timestamp_string = result.timestamp.isoformat()

        batch_if_valid('timestamp', timestamp_string)

    gw.batch_set_cell(cell_updates)


def missing_required_columns(gw: GWorksheet):
    missing = False
    for required_col in ['url', 'status']:
        if not gw.col_exists(required_col):
            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
            missing = True
    return missing


def should_process_sheet(c: Config, sheet_name):
    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
        # ALLOW rules exist AND sheet name not explicitly allowed
        return False
    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
        # BLOCK rules exist AND sheet name is blocked
        return False
    return True


def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
    url = expand_url(url)
    c.set_folder(folder)
    storage = c.get_storage()

    # make a new driver so each spreadsheet row is idempotent
    c.recreate_webdriver()

    # order matters, first to succeed excludes remaining
    active_archivers = [
        TelethonArchiver(storage, c),
        TiktokArchiver(storage, c),
        TwitterApiArchiver(storage, c),
        InstagramArchiver(storage, c),
        YoutubeDLArchiver(storage, c),
        TelegramArchiver(storage, c),
        TwitterArchiver(storage, c),
        VkArchiver(storage, c),
        WaybackArchiver(storage, c)
    ]

    for archiver in active_archivers:
        logger.debug(f'Trying {archiver} on {debug_string}')

        try:
            result = archiver.download(url, check_if_exists=c.check_if_exists)
        except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
        except Exception as e:
            result = False
            logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

        if result:
            success = result.status in ['success', 'already archived']
            result.status = f"{archiver.name}: {result.status}"
            if success:
                logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
                break
            # only 1 retry possible for now
            if is_retry and Archiver.is_retry(result.status):
                result.status = Archiver.remove_retry(result.status)
            logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
    return result


def process_sheet(c: Config):
    sh = c.gsheets_client.open(c.sheet)

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        if not should_process_sheet(c, wks.title):
            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
            continue

        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

        if missing_required_columns(gw): continue

        # archives will default to being in a folder 'doc_name/worksheet_name'
        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
        c.set_folder(default_folder)
        storage = c.get_storage()

        # loop through rows in worksheet
        for row in range(1 + c.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url')
            original_status = gw.get_cell(row, 'status')
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')

            is_retry = False
            if url == '' or status not in ['', None]:
                is_retry = Archiver.should_retry_from_status(status)
                if not is_retry: continue

            # All checks done - archival process starts here
            try:
                gw.set_cell(row, 'status', 'Archive in progress')
                result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
                if result:
                    update_sheet(gw, row, url, result)
                else:
                    gw.set_cell(row, 'status', 'failed: no archiver')
            except KeyboardInterrupt:
                # catches keyboard interruptions to do a clean exit
                logger.warning(f"caught interrupt on {row=}, {url=}")
                gw.set_cell(row, 'status', '')
                c.destroy_webdriver()
                exit()
            except Exception as e:
                logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
                gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
        logger.success(f'Finished worksheet {wks.title}')


@logger.catch
def main():
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} for header {c.header}')
    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
        Storage.TMP_FOLDER = tmpdir
        process_sheet(c)
    c.destroy_webdriver()


if __name__ == '__main__':
    main()
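Aside (not in the diff): should_process_sheet above is a plain allow/block filter; the worksheet names below are placeholders.

c.worksheet_allow, c.worksheet_block = ["Sheet1"], []
should_process_sheet(c, "Sheet1")  # True: explicitly allowed
should_process_sheet(c, "Sheet2")  # False: an allow-list exists and does not include it
# with both lists empty, every worksheet is processed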
@@ -1,29 +1,34 @@
import tempfile
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage

#TODO: refactor GDriveStorage before merging to main
# is it possible to have something like this with the new pipeline?


def main():
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} to look for sheet names to archive')

    gc = c.gsheets_client
    sh = gc.open(c.sheet)

    wks = sh.get_worksheet(0)
    values = wks.get_all_values()

    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
        Storage.TMP_FOLDER = tmpdir
        for i in range(11, len(values)):
            c.sheet = values[i][0]
            logger.info(f"Processing {c.sheet}")
            auto_archive.process_sheet(c)
    c.destroy_webdriver()
# # import tempfile
# import auto_archive
# from loguru import logger
# from configs import Config
# from storages import Storage


if __name__ == "__main__":
    main()
# def main():
# c = Config()
# c.parse()
# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')

# gc = c.gsheets_client
# sh = gc.open(c.sheet)

# wks = sh.get_worksheet(0)
# values = wks.get_all_values()

# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
# Storage.TMP_FOLDER = tmpdir
# for i in range(11, len(values)):
# c.sheet = values[i][0]
# logger.info(f"Processing {c.sheet}")
# auto_archive.process_sheet(c)
# c.destroy_webdriver()


# if __name__ == "__main__":
# main()
@@ -4,4 +4,4 @@ from .step import Step

# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
# from .v2config import ConfigV2
# from .config import Config
@@ -5,31 +5,31 @@ from dataclasses import dataclass, field
from typing import List
from collections import defaultdict

from ..archivers import Archiverv2
from ..archivers import Archiver
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import StorageV2
from ..storages import Storage
from . import Step
from ..enrichers import Enricher


@dataclass
class ConfigV2:
class Config:
    # TODO: should Config inherit from Step so it can have it's own configurations?
    # these are only detected if they are put to the respective __init__.py
    configurable_parents = [
        Feeder,
        Enricher,
        Archiverv2,
        Archiver,
        Database,
        StorageV2,
        Storage,
        Formatter
        # Util
    ]
    feeder: Step  # TODO:= BaseFeeder
    formatter: Formatter
    archivers: List[Archiverv2] = field(default_factory=[])  # TODO: fix type
    archivers: List[Archiver] = field(default_factory=[])  # TODO: fix type
    enrichers: List[Enricher] = field(default_factory=[])
    storages: List[Step] = field(default_factory=[])  # TODO: fix type
    databases: List[Database] = field(default_factory=[])
@@ -107,9 +107,9 @@ class ConfigV2:
        self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
        self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
        self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
        self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])]
        self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
        self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
        self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]
        self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]

        print("feeder", self.feeder)
        print("enrichers", [e for e in self.enrichers])
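For reference (an assumption, not shown in the diff): parse() above reads a steps mapping, e.g. from the orchestration.yaml now listed in .gitignore. A hypothetical shape, using step names that do appear elsewhere in this commit:

steps = {
    "feeder": "cli_feeder",          # default when omitted
    "formatter": "html_formatter",   # default when omitted
    "enrichers": [],
    "archivers": ["telethon_archiver", "tiktok_archiver"],
    "databases": ["csv_db"],
    "storages": ["local_storage"],
}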
@@ -4,7 +4,7 @@ from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import datetime, mimetypes
import datetime
from urllib.parse import urlparse
from loguru import logger
from dateutil.parser import parse as parse_dt
@@ -17,7 +17,7 @@ class Metadata:
    status: str = "no archiver"
    _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
    metadata: Dict[str, Any] = field(default_factory=dict)
    tmp_keys: Set[str] = field(default_factory=set, repr=False)  # keys that are not to be saved in DBs
    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True})  # keys that are not to be saved in DBs
    media: List[Media] = field(default_factory=list)
    final_media: Media = None  # can be overwritten by formatters
    rearchivable: bool = False
@@ -3,10 +3,10 @@ from ast import List
from typing import Union, Dict
from dataclasses import dataclass

from ..archivers import Archiverv2
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import StorageV2
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .media import Media
@@ -59,9 +59,9 @@ class ArchivingOrchestrator:
        self.feeder: Feeder = config.feeder
        self.formatter: Formatter = config.formatter
        self.enrichers = config.enrichers
        self.archivers: List[Archiverv2] = config.archivers
        self.archivers: List[Archiver] = config.archivers
        self.databases: List[Database] = config.databases
        self.storages: List[StorageV2] = config.storages
        self.storages: List[Storage] = config.storages

        for a in self.archivers: a.setup()
@@ -69,12 +69,12 @@ class ArchivingOrchestrator:
        for item in self.feeder:
            self.feed_item(item)

    def feed_item(self, item:Metadata) -> Metadata:
    def feed_item(self, item: Metadata) -> Metadata:
        print("ARCHIVING", item)
        try:
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                item.set_tmp_dir(tmp_dir)
                result = self.archive(item)
                return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
            logger.warning(f"caught interrupt on {item=}")
@@ -84,8 +84,6 @@ class ArchivingOrchestrator:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.databases: d.failed(item)

        return result

        # how does this handle the parameters like folder which can be different for each archiver?
        # the storage needs to know where to archive!!
        # solution: feeders have context: extra metadata that they can read or ignore,
@@ -154,7 +152,7 @@ class ArchivingOrchestrator:
            for prop in m.properties.values():
                if isinstance(prop, Media):
                    s.store(prop, result)
                if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media):
                if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
                    for prop_media in prop:
                        s.store(prop_media, result)
@@ -28,7 +28,7 @@ class Step(ABC):
        for sub in child.__subclasses__():
            if sub.name == name:
                return sub(config)
        raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.")
        raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")

    def assert_valid_string(self, prop: str) -> None:
        """
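In other words, Step.init resolves steps purely by the name class attribute: it walks the subclasses of the requested parent and instantiates the first match, which is why a step only becomes usable once its module is imported in the package __init__.py. Illustrative call:

archiver = Archiver.init("telethon_archiver", config)  # returns a TelethonArchiver instance
# an unknown or un-imported name raises ClassFoundException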
@@ -1,3 +1,4 @@
from .database import Database
from .gsheet_db import GsheetsDb
from .console_db import ConsoleDb
from .console_db import ConsoleDb
from .csv_db import CSVDb
@@ -0,0 +1,33 @@
import os
from loguru import logger

from . import Database
from ..core import Metadata
from csv import DictWriter
from dataclasses import asdict

class CSVDb(Database):
    """
    Outputs results to a CSV file
    """
    name = "csv_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        self.assert_valid_string("csv_file")

    @staticmethod
    def configs() -> dict:
        return {
            "csv_file": {"default": "db.csv", "help": "CSV file name"}
        }

    def done(self, item: Metadata) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item}")
        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
        with open(self.csv_file, "a", encoding="utf-8") as outf:
            writer = DictWriter(outf, fieldnames=asdict(Metadata()))
            if is_empty: writer.writeheader()
            writer.writerow(asdict(item))
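Note that DictWriter takes its fieldnames from asdict(Metadata()), so every dataclass field of Metadata (status, metadata, media, ... as listed in the Metadata hunk above) becomes a CSV column. A rough usage sketch — the config shape is an assumption inferred from assert_valid_string("csv_file") and configs():

# hypothetical: assumes Step.__init__ copies the csv_file option onto the instance
db = CSVDb({"csv_db": {"csv_file": "results.csv"}})
db.done(Metadata())  # writes the header on an empty file, then one row per archived item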
@@ -18,7 +18,6 @@ class Database(Step, ABC):
        # only for typing...
        return Step.init(name, config, Database)

    @abstractmethod
    def started(self, item: Metadata) -> None:
        """signals the DB that the given item archival has started"""
        pass
@@ -2,10 +2,10 @@ from loguru import logger
import time, requests

from . import Enricher
from ..archivers import Archiverv2
from ..archivers import Archiver
from ..core import Metadata

class WaybackArchiverEnricher(Enricher, Archiverv2):
class WaybackArchiverEnricher(Enricher, Archiver):
    """
    Submits the current URL to the webarchive and returns a job_id or completed archive
    """
@@ -1,2 +1,3 @@
from.feeder import Feeder
from .gsheet_feeder import GsheetsFeeder
from .gsheet_feeder import GsheetsFeeder
from .cli_feeder import CLIFeeder
@@ -0,0 +1,33 @@
import gspread, os

# from metadata import Metadata
from loguru import logger

# from . import Enricher
from . import Feeder
from ..core import Metadata


class CLIFeeder(Feeder):
    name = "cli_feeder"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        assert type(self.urls) == list and len(self.urls) > 0, "Please provide a CSV list of URL(s) to process, with --cli_feeder.urls='url1,url2,url3'"

    @staticmethod
    def configs() -> dict:
        return {
            "urls": {
                "default": None,
                "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
            },
        }

    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing {url}")
            yield Metadata().set_url(url).set("folder", "cli", True)
        logger.success(f"Processed {len(self.urls)} URL(s)")
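As the assert message above says, URLs arrive as one comma-separated CLI value (e.g. --cli_feeder.urls='url1,url2,url3'); the cli_set lambda splits and de-duplicates it. Illustrative values only:

cli_set = lambda cli_val, cur_val: list(set(cli_val.split(",")))
urls = cli_set("https://example.com/a,https://example.com/b", None)
# each URL is then yielded as Metadata().set_url(url).set("folder", "cli", True)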
@@ -1,9 +1,3 @@
# we need to explicitly expose the available imports here
from .base_storage import Storage
# from .local_storage import LocalStorage, LocalConfig
# from .s3_storage import S3Config, S3Storage
# from .gd_storage import GDConfig, GDStorage

from .storage import StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2
from .storage import Storage
from .s3 import S3Storage
from .local import LocalStorage
@@ -1,33 +0,0 @@
import os, uuid
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path


class Storage(ABC):
    TMP_FOLDER = "tmp/"

    @abstractmethod
    def __init__(self, config): pass

    @abstractmethod
    def get_cdn_url(self, key): pass

    @abstractmethod
    def exists(self, key): pass

    @abstractmethod
    def uploadf(self, file, key, **kwargs): pass

    def clean_key(self, key):
        # Some storages does not work well with trailing forward slashes and some keys come with that
        if key.startswith('/'):
            logger.debug(f'Found and fixed a leading "/" for {key=}')
            return key[1:]
        return key


    def upload(self, filename: str, key: str, **kwargs):
        logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
        with open(filename, 'rb') as f:
            self.uploadf(f, key, **kwargs)
@@ -1,178 +1,181 @@
import os, time

from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
#TODO: refactor GDriveStorage before merging to main

# import os, time

# from loguru import logger
# from .base_storage import Storage
# from dataclasses import dataclass
# from googleapiclient.discovery import build
# from googleapiclient.http import MediaFileUpload
# from google.oauth2 import service_account


from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
# from google.oauth2.credentials import Credentials
# from google.auth.transport.requests import Request

@dataclass
class GDConfig:
    root_folder_id: str
    oauth_token_filename: str
    service_account: str = "service_account.json"
    folder: str = "default"
# @dataclass
# class GDConfig:
# root_folder_id: str
# oauth_token_filename: str
# service_account: str = "service_account.json"
# folder: str = "default"

class GDStorage(Storage):
    def __init__(self, config: GDConfig):
        self.folder = config.folder
        self.root_folder_id = config.root_folder_id
# class GDStorage(Storage):
# def __init__(self, config: GDConfig):
# self.folder = config.folder
# self.root_folder_id = config.root_folder_id

        SCOPES=['https://www.googleapis.com/auth/drive']
# SCOPES=['https://www.googleapis.com/auth/drive']

        token_file = config.oauth_token_filename
        if token_file is not None:
            """
            Tokens are refreshed after 1 hour
            however keep working for 7 days (tbc)
            so as long as the job doesn't last for 7 days
            then this method of refreshing only once per run will work
            see this link for details on the token
            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
            """
            logger.debug(f'Using GD OAuth token {token_file}')
            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
# token_file = config.oauth_token_filename
# if token_file is not None:
# """
# Tokens are refreshed after 1 hour
# however keep working for 7 days (tbc)
# so as long as the job doesn't last for 7 days
# then this method of refreshing only once per run will work
# see this link for details on the token
# https://davemateer.com/2022/04/28/google-drive-with-python#tokens
# """
# logger.debug(f'Using GD OAuth token {token_file}')
# creds = Credentials.from_authorized_user_file(token_file, SCOPES)

            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    logger.debug('Requesting new GD OAuth token')
                    creds.refresh(Request())
                else:
                    raise Exception("Problem with creds - create the token again")
# if not creds or not creds.valid:
# if creds and creds.expired and creds.refresh_token:
# logger.debug('Requesting new GD OAuth token')
# creds.refresh(Request())
# else:
# raise Exception("Problem with creds - create the token again")

                # Save the credentials for the next run
                with open(token_file, 'w') as token:
                    logger.debug('Saving new GD OAuth token')
                    token.write(creds.to_json())
            else:
                logger.debug('GD OAuth Token valid')
        else:
            gd_service_account = config.service_account
            logger.debug(f'Using GD Service Account {gd_service_account}')
            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
# # Save the credentials for the next run
# with open(token_file, 'w') as token:
# logger.debug('Saving new GD OAuth token')
# token.write(creds.to_json())
# else:
# logger.debug('GD OAuth Token valid')
# else:
# gd_service_account = config.service_account
# logger.debug(f'Using GD Service Account {gd_service_account}')
# creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)

        self.service = build('drive', 'v3', credentials=creds)
# self.service = build('drive', 'v3', credentials=creds)

    def get_cdn_url(self, key):
        """
        only support files saved in a folder for GD
        S3 supports folder and all stored in the root
        """
        key = self.clean_key(key)
# def get_cdn_url(self, key):
# """
# only support files saved in a folder for GD
# S3 supports folder and all stored in the root
# """
# key = self.clean_key(key)

        full_name = os.path.join(self.folder, key)
        parent_id, folder_id = self.root_folder_id, None
        path_parts = full_name.split(os.path.sep)
        filename = path_parts[-1]
        logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
        for folder in path_parts[0:-1]:
            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
            parent_id = folder_id
# full_name = os.path.join(self.folder, key)
# parent_id, folder_id = self.root_folder_id, None
# path_parts = full_name.split(os.path.sep)
# filename = path_parts[-1]
# logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
# for folder in path_parts[0:-1]:
# folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
# parent_id = folder_id

        # get id of file inside folder (or sub folder)
        file_id = self._get_id_from_parent_and_name(folder_id, filename)
        return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
# # get id of file inside folder (or sub folder)
# file_id = self._get_id_from_parent_and_name(folder_id, filename)
# return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"

    def exists(self, key):
        try:
            self.get_cdn_url(key)
            return True
        except: return False
# def exists(self, key):
# try:
# self.get_cdn_url(key)
# return True
# except: return False

    def uploadf(self, file: str, key: str, **_kwargs):
        """
        1. for each sub-folder in the path check if exists or create
        2. upload file to root_id/other_paths.../filename
        """
        key = self.clean_key(key)
# def uploadf(self, file: str, key: str, **_kwargs):
# """
# 1. for each sub-folder in the path check if exists or create
# 2. upload file to root_id/other_paths.../filename
# """
# key = self.clean_key(key)

        full_name = os.path.join(self.folder, key)
        parent_id, upload_to = self.root_folder_id, None
        path_parts = full_name.split(os.path.sep)
        filename = path_parts[-1]
        logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
        for folder in path_parts[0:-1]:
            upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
            if upload_to is None:
                upload_to = self._mkdir(folder, parent_id)
            parent_id = upload_to
# full_name = os.path.join(self.folder, key)
# parent_id, upload_to = self.root_folder_id, None
# path_parts = full_name.split(os.path.sep)
# filename = path_parts[-1]
# logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
# for folder in path_parts[0:-1]:
# upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
# if upload_to is None:
# upload_to = self._mkdir(folder, parent_id)
# parent_id = upload_to

        # upload file to gd
        logger.debug(f'uploading {filename=} to folder id {upload_to}')
        file_metadata = {
            'name': [filename],
            'parents': [upload_to]
        }
        media = MediaFileUpload(file, resumable=True)
        gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
# # upload file to gd
# logger.debug(f'uploading {filename=} to folder id {upload_to}')
# file_metadata = {
# 'name': [filename],
# 'parents': [upload_to]
# }
# media = MediaFileUpload(file, resumable=True)
# gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
# logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')

    def upload(self, filename: str, key: str, **kwargs):
        # GD only requires the filename not a file reader
        self.uploadf(filename, key, **kwargs)
# def upload(self, filename: str, key: str, **kwargs):
# # GD only requires the filename not a file reader
# self.uploadf(filename, key, **kwargs)

    # gets the Drive folderID if it is there
    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
        """
        Retrieves the id of a folder or file from its @name and the @parent_id folder
        Optionally does multiple @retries and sleeps @sleep_seconds between them
        If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
        If @raise_on_missing will throw error when not found, or returns None
        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
        Returns the id of the file or folder from its name as a string
        """
        # cache logic
        if use_cache:
            self.api_cache = getattr(self, "api_cache", {})
            cache_key = f"{parent_id}_{name}_{use_mime_type}"
            if cache_key in self.api_cache:
                logger.debug(f"cache hit for {cache_key=}")
                return self.api_cache[cache_key]
# # gets the Drive folderID if it is there
# def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
# """
# Retrieves the id of a folder or file from its @name and the @parent_id folder
# Optionally does multiple @retries and sleeps @sleep_seconds between them
# If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
# If @raise_on_missing will throw error when not found, or returns None
# Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
# Returns the id of the file or folder from its name as a string
# """
# # cache logic
# if use_cache:
# self.api_cache = getattr(self, "api_cache", {})
# cache_key = f"{parent_id}_{name}_{use_mime_type}"
# if cache_key in self.api_cache:
# logger.debug(f"cache hit for {cache_key=}")
# return self.api_cache[cache_key]

        # API logic
        debug_header: str = f"[searching {name=} in {parent_id=}]"
        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
        if use_mime_type:
            query_string += f" and mimeType='application/vnd.google-apps.folder' "
# # API logic
# debug_header: str = f"[searching {name=} in {parent_id=}]"
# query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
# if use_mime_type:
# query_string += f" and mimeType='application/vnd.google-apps.folder' "

        for attempt in range(retries):
            results = self.service.files().list(
                q=query_string,
                spaces='drive',  # ie not appDataFolder or photos
                fields='files(id, name)'
            ).execute()
            items = results.get('files', [])
# for attempt in range(retries):
# results = self.service.files().list(
# q=query_string,
# spaces='drive', # ie not appDataFolder or photos
# fields='files(id, name)'
# ).execute()
# items = results.get('files', [])

            if len(items) > 0:
                logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
                _id = items[-1]['id']
                if use_cache: self.api_cache[cache_key] = _id
                return _id
            else:
                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
                if attempt < retries - 1:
                    logger.debug(f'sleeping for {sleep_seconds} second(s)')
                    time.sleep(sleep_seconds)
# if len(items) > 0:
# logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
# _id = items[-1]['id']
# if use_cache: self.api_cache[cache_key] = _id
# return _id
# else:
# logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
# if attempt < retries - 1:
# logger.debug(f'sleeping for {sleep_seconds} second(s)')
# time.sleep(sleep_seconds)

        if raise_on_missing:
            raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
        return None
# if raise_on_missing:
# raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
# return None

    def _mkdir(self, name: str, parent_id: str):
        """
        Creates a new GDrive folder @name inside folder @parent_id
        Returns id of the created folder
        """
        logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
        file_metadata = {
            'name': [name],
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [parent_id]
        }
        gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
        return gd_folder.get('id')
# def _mkdir(self, name: str, parent_id: str):
# """
# Creates a new GDrive folder @name inside folder @parent_id
# Returns id of the created folder
# """
# logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
# file_metadata = {
# 'name': [name],
# 'mimeType': 'application/vnd.google-apps.folder',
# 'parents': [parent_id]
# }
# gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
# return gd_folder.get('id')
@@ -8,10 +8,10 @@ from slugify import slugify

from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
from ..storages import Storage


class LocalStorageV2(StorageV2):
class LocalStorage(Storage):
    name = "local_storage"

    def __init__(self, config: dict) -> None:
@@ -4,12 +4,12 @@ import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from ..core import Metadata
from ..core import Media
from ..storages import StorageV2
from ..storages import Storage
from loguru import logger
from slugify import slugify


class S3StorageV2(StorageV2):
class S3Storage(Storage):
    name = "s3_storage"

    def __init__(self, config: dict) -> None:
@@ -1,80 +0,0 @@
import uuid, os, mimetypes
from dataclasses import dataclass

import boto3
from botocore.errorfactory import ClientError

from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger


@dataclass
class S3Config:
    bucket: str
    region: str
    key: str
    secret: str
    folder: str = ""
    endpoint_url: str = "https://{region}.digitaloceanspaces.com"
    cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    private: bool = False
    key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid


class S3Storage(Storage):

    def __init__(self, config: S3Config):
        self.bucket = config.bucket
        self.region = config.region
        self.folder = config.folder
        self.private = config.private
        self.cdn_url = config.cdn_url
        self.key_path = config.key_path
        self.key_dict = {}

        self.s3 = boto3.client(
            's3',
            region_name=config.region,
            endpoint_url=config.endpoint_url.format(region=config.region),
            aws_access_key_id=config.key,
            aws_secret_access_key=config.secret
        )

    def _get_path(self, key):
        """
        Depends on the self.key_path configuration:
        * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
        * default -> defaults to self.folder/key
        """
        # defaults to /key
        final_key = key
        if self.key_path == "random":
            if key not in self.key_dict:
                ext = os.path.splitext(key)[1]
                self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
            final_key = self.key_dict[key]
        return os.path.join(self.folder, final_key)

    def get_cdn_url(self, key):
        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))

    def exists(self, key):
        try:
            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
            return True
        except ClientError:
            return False

    def uploadf(self, file, key, **kwargs):
        extra_args = kwargs.get("extra_args", {})
        if not self.private and 'ACL' not in extra_args:
            extra_args['ACL'] = 'public-read'

        if 'ContentType' not in extra_args:
            try:
                extra_args['ContentType'] = mimetypes.guess_type(key)[0]
            except Exception as e:
                logger.error(f"Unable to get mimetype for {key=}, error: {e}")

        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
@@ -10,7 +10,7 @@ from slugify import slugify


@dataclass
class StorageV2(Step):
class Storage(Step):
    name = "storage"

    def __init__(self, config: dict) -> None:
@@ -18,8 +18,8 @@ class StorageV2(Step):
        super().__init__(config)

    # only for typing...
    def init(name: str, config: dict) -> StorageV2:
        return Step.init(name, config, StorageV2)
    def init(name: str, config: dict) -> Storage:
        return Step.init(name, config, Storage)

    def store(self, media: Media, item: Metadata) -> None:
        self.set_key(media, item)