diff --git a/README.md b/README.md
index e95949e..c52b077 100644
--- a/README.md
+++ b/README.md
@@ -101,24 +101,18 @@ graph TD
     A(BaseStorage) -->|parent of| C(GoogleDriveStorage)
 ```
 
-## Saving into Folders
+## Saving into Subfolders
 
-To use a column from the spreadsheet called `File Number` eg SM001234 as a directory on the cloud storage, you need to pass in
-
-```bash
-python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
-```
+You can add a column to the spreadsheet, named via the `--col-subfolder` argument, whose value is passed to the storage and specifies the subfolder that archived media is saved into.
 
 ## Google Drive
 
 To use Google Drive storage you need the id of the shared folder in the `.env` file; the folder must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
 
 ```bash
-python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
+python auto_archive.py --sheet 'Sheet Name' --storage='gd'
 ```
 
-Note the you must use filenumber for Google Drive Storage.
-
 ## Telethon (Telegram's API Library)
 
 Put your `anon.session` in the root, so that it doesn't stall and ask for authentication
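For example, if the spreadsheet has a column headed `sub folder` (the default name, per `GWorksheet.COLUMN_NAMES` further down in this diff), a run that reads it could look like this — an illustrative invocation, not part of the diff itself:

```bash
python auto_archive.py --sheet 'Sheet Name' --col-subfolder 'sub folder'
```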

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 367b483..6e11957 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -18,6 +18,7 @@ from selenium.webdriver.common.by import By
 from loguru import logger
 from selenium.common.exceptions import TimeoutException
 
+
 @dataclass
 class ArchiveResult:
     status: str
@@ -42,7 +43,7 @@ class Archiver(ABC):
         return self.__class__.__name__
 
     @abstractmethod
-    def download(self, url, check_if_exists=False, filenumber=None): pass
+    def download(self, url, check_if_exists=False): pass
 
     def get_netloc(self, url):
         return urlparse(url).netloc
 
@@ -51,7 +52,7 @@ class Archiver(ABC):
         return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
 
     # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
-    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
+    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         page = f'''<html><head><title>{url}</title></head>
             <body>
             <h2>Archived media from {self.name}</h2>

@@ -71,10 +72,6 @@ class Archiver(ABC):
         page_hash = self.get_hash(page_filename)
 
-        if filenumber != None:
-            logger.trace(f'filenumber for directory is {filenumber}')
-            page_key = filenumber + "/" + page_key
-
         self.storage.upload(page_filename, page_key, extra_args={
                             'ACL': 'public-read', 'ContentType': 'text/html'})
@@ -82,7 +79,7 @@ class Archiver(ABC):
         return (page_cdn, page_hash, thumbnail)
 
     # eg images in a tweet save to cloud storage
-    def generate_media_page(self, urls, url, object, filenumber=None):
+    def generate_media_page(self, urls, url, object):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
         }
@@ -102,10 +99,6 @@ class Archiver(ABC):
             with open(filename, 'wb') as f:
                 f.write(d.content)
 
-            if filenumber is not None:
-                logger.debug(f'filenumber for directory is {filenumber}')
-                key = filenumber + "/" + key
-
             # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
             # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
             # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
@@ -120,7 +113,7 @@ class Archiver(ABC):
                 thumbnail = cdn_url
             uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
 
-        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
+        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
 
     def get_key(self, filename):
         """
@@ -140,16 +133,15 @@ class Archiver(ABC):
     def get_hash(self, filename):
         f = open(filename, "rb")
         bytes = f.read()  # read entire file as bytes
-
+
+        # TODO: customizable hash
         hash = hashlib.sha256(bytes)
         # option to use SHA3_512 instead
         # hash = hashlib.sha3_512(bytes)
         f.close()
         return hash.hexdigest()
 
-    # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
-    # def get_screenshot(self, url, filenumber, storage="GD"):
-    def get_screenshot(self, url, filenumber):
+    def get_screenshot(self, url):
         key = self.get_key(urlparse(url).path.replace(
             "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
         filename = 'tmp/' + key
@@ -158,8 +150,8 @@ class Archiver(ABC):
         if 'facebook.com' in url:
             try:
                 logger.debug(f'Trying fb click accept cookie popup for {url}')
-                self.driver.get("http://www.facebook.com")
-                foo = self.driver.find_element(By.XPATH,"//button[@data-cookiebanner='accept_only_essential_button']")
+                self.driver.get("http://www.facebook.com")
+                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                 foo.click()
                 logger.debug(f'fb click worked')
                 # linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page
@@ -174,11 +166,6 @@ class Archiver(ABC):
             logger.info("TimeoutException loading page for screenshot")
 
         self.driver.save_screenshot(filename)
-
-        if filenumber is not None:
-            logger.debug(f'filenumber for directory is {filenumber}')
-            key = filenumber + "/" + key
-
         self.storage.upload(filename, key, extra_args={
             'ACL': 'public-read', 'ContentType': 'image/png'})
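The `# TODO: customizable hash` note in `get_hash` above could be addressed with `hashlib.new`, which takes the algorithm name as a string; a minimal sketch of that idea (an assumption, not part of this diff):

```python
import hashlib

def get_hash(self, filename, algorithm="sha256"):
    # algorithm may be any name hashlib supports, eg "sha256" or "sha3_512"
    with open(filename, "rb") as f:
        h = hashlib.new(algorithm)
        h.update(f.read())
    return h.hexdigest()
```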
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index b19ab8f..5a7f63c 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TelegramArchiver(Archiver):
     name = "telegram"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 't.me' != self.get_netloc(url):
             return False
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
         if url[-8:] != "?embed=1":
             url += "?embed=1"
 
-        screenshot = self.get_screenshot(url, filenumber=filenumber)
+        screenshot = self.get_screenshot(url)
 
         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
                 urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
                 images += urls
 
-        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber)
+        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
 
         time_elements = s.find_all('time')
         timestamp = time_elements[0].get('datetime') if len(time_elements) else None
@@ -52,9 +52,6 @@ class TelegramArchiver(Archiver):
             video_id = video_url.split('/')[-1].split('?')[0]
             key = self.get_key(video_id)
 
-            if filenumber is not None:
-                key = filenumber + "/" + key
-
             filename = 'tmp/' + key
             cdn_url = self.storage.get_cdn_url(key)
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index 5cee791..9e92383 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -7,6 +7,7 @@ from loguru import logger
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
 from telethon.sync import TelegramClient
+from telethon.errors import ChannelInvalidError
 
 
 @dataclass
@@ -41,14 +42,14 @@ class TelethonArchiver(Archiver):
                 media.append(post)
         return media
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         matches = self.link_pattern.findall(url)
        if not len(matches):
             return False
 
         status = "success"
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
 
         # app will ask (stall for user input!) for phone number and auth code if anon.session not found
         with self.client.start():
@@ -60,7 +61,11 @@ class TelethonArchiver(Archiver):
             try:
                 post = self.client.get_messages(chat, ids=post_id)
             except ValueError as e:
-                logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
+                logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
+                return False
+            except ChannelInvalidError as e:
+                # TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
+                logger.error(f'Could not fetch telegram {url} possibly it\'s private or not displayable: {e}')
                 return False
 
             media_posts = self._get_media_posts_in_group(chat, post)
@@ -68,11 +73,8 @@ class TelethonArchiver(Archiver):
             if len(media_posts) > 1:
                 key = self.get_html_key(url)
 
-                if filenumber is not None:
-                    key = filenumber + "/" + key
-
                 if check_if_exists and self.storage.exists(key):
-                    # only s3 storage supports storage.exists as not implemented on gd
+                    # only s3 storage supports storage.exists as not implemented on gd
                     cdn_url = self.storage.get_cdn_url(key)
                     status = 'already archived'
                     return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
@@ -84,26 +86,19 @@ class TelethonArchiver(Archiver):
                     if len(mp.message) > len(message): message = mp.message
 
                     filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
                     key = filename.split('tmp/')[1]
-
-                    if filenumber is not None:
-                        key = filenumber + "/" + key
 
                     self.storage.upload(filename, key)
                     hash = self.get_hash(filename)
                     cdn_url = self.storage.get_cdn_url(key)
                     uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
                     os.remove(filename)
 
-                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
+                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
                 return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
 
             elif len(media_posts) == 1:
                 key = self.get_key(f'{chat}_{post_id}')
                 filename = self.client.download_media(post.media, f'tmp/{key}')
                 key = filename.split('tmp/')[1].replace(" ", "")
-
-                if filenumber is not None:
-                    key = filenumber + "/" + key
-
                 self.storage.upload(filename, key)
                 hash = self.get_hash(filename)
                 cdn_url = self.storage.get_cdn_url(key)
@@ -112,5 +107,5 @@ class TelethonArchiver(Archiver):
                 return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
 
-        page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
+        page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
         return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 9b90efa..47b8374 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TiktokArchiver(Archiver):
     name = "tiktok"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
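For context on `_get_media_posts_in_group`, of which only the tail is visible in the telethon hunk above: Telegram album posts share a `grouped_id`, so siblings can be found by fetching messages with nearby ids and filtering on that id. A rough sketch of the idea, with an assumed ±10 id search window:

```python
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
    # a lone post is its own "group" if it carries media
    if original_post.grouped_id is None:
        return [original_post] if original_post.media is not None else []
    # album posts get consecutive ids, so scan a window around the original
    search_ids = list(range(original_post.id - max_amp, original_post.id + max_amp + 1))
    posts = self.client.get_messages(chat, ids=search_ids)
    media = []
    for post in posts:
        if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
            media.append(post)
    return media
```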
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 05e7ec0..04ed578 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -8,15 +8,15 @@ from .base_archiver import Archiver, ArchiveResult
 class TwitterArchiver(Archiver):
     name = "twitter"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
-
+    def download(self, url, check_if_exists=False):
+
         if 'twitter.com' != self.get_netloc(url):
             return False
 
         tweet_id = urlparse(url).path.split('/')
         if 'status' in tweet_id:
             i = tweet_id.index('status')
-            tweet_id = tweet_id[i+1]
+            tweet_id = tweet_id[i + 1]
         else:
             return False
 
@@ -25,9 +25,7 @@ class TwitterArchiver(Archiver):
         try:
             tweet = next(scr.get_items())
         except Exception as ex:
-            template = "TwitterArchiver cant get tweet and threw, which can happen if a media sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
-            message = template.format(type(ex).__name__, ex.args)
-            logger.warning(message)
+            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
             return False
 
         if tweet.media is None:
@@ -48,8 +46,8 @@ class TwitterArchiver(Archiver):
             else:
                 logger.warning(f"Could not get media URL of {media}")
 
-        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
+        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
 
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
         return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index 652798a..d8479f1 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -14,7 +14,7 @@ class WaybackArchiver(Archiver):
         super(WaybackArchiver, self).__init__(storage, driver)
         self.seen_urls = {}
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         if check_if_exists and url in self.seen_urls:
             return self.seen_urls[url]
 
@@ -75,7 +75,7 @@ class WaybackArchiver(Archiver):
         except:
             title = "Could not get title"
 
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
         result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
         self.seen_urls[url] = result
         return result
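To make the tweet-id extraction in the twitter hunk above concrete, a quick worked example (illustrative values, reusing the status URL from the doc's own comments):

```python
from urllib.parse import urlparse

path = urlparse('https://twitter.com/minmyatnaing13/status/1499415562937503751').path
tweet_id = path.split('/')
# tweet_id == ['', 'minmyatnaing13', 'status', '1499415562937503751']
i = tweet_id.index('status')
tweet_id = tweet_id[i + 1]  # '1499415562937503751'
```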
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 9983950..a6ea615 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -7,6 +7,7 @@ from loguru import logger
 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
 
+
 class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
@@ -15,7 +16,7 @@ class YoutubeDLArchiver(Archiver):
         super().__init__(storage, driver)
         self.fb_cookie = fb_cookie
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
         if netloc in ['facebook.com', 'www.facebook.com']:
             logger.debug('Using Facebook cookie')
@@ -61,9 +62,6 @@ class YoutubeDLArchiver(Archiver):
             key = self.get_key(filename)
 
-            if filenumber is not None:
-                key = filenumber + "/" + key
-
             if self.storage.exists(key):
                 status = 'already archived'
                 cdn_url = self.storage.get_cdn_url(key)
@@ -87,10 +85,6 @@ class YoutubeDLArchiver(Archiver):
         if status != 'already archived':
             key = self.get_key(filename)
-
-            if filenumber is not None:
-                key = filenumber + "/" + key
-
             self.storage.upload(filename, key)
 
             # filename ='tmp/sDE-qZdi8p8.webm'
@@ -98,8 +92,7 @@ class YoutubeDLArchiver(Archiver):
             cdn_url = self.storage.get_cdn_url(key)
             hash = self.get_hash(filename)
 
-        screenshot = self.get_screenshot(url, filenumber)
-
+        screenshot = self.get_screenshot(url)
         # get duration
         duration = info.get('duration')
@@ -115,9 +108,9 @@ class YoutubeDLArchiver(Archiver):
 
         timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
             if 'timestamp' in info else \
-            datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
+            datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
             if 'upload_date' in info and info['upload_date'] is not None else \
-            None
+            None
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
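The chained conditional expression that closes the youtubedl hunk is dense; an equivalent, easier-to-read form (a rewrite for illustration only, where `info` is the youtube_dl metadata dict from the surrounding code):

```python
import datetime

if 'timestamp' in info:
    # note: this branch yields an ISO string, the next a datetime — preserved from the original
    timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
elif 'upload_date' in info and info['upload_date'] is not None:
    timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
else:
    timestamp = None
```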
diff --git a/auto_archive.py b/auto_archive.py
index a3c17d1..8044e06 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -23,6 +23,7 @@ logger.add("logs/5error.log", level="ERROR")
 
 load_dotenv()
 
+
 def update_sheet(gw, row, result: archivers.ArchiveResult):
     cell_updates = []
     row_values = gw.get_row(row)
@@ -68,7 +69,7 @@ def expand_url(url):
     return url
 
 
-def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
+def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
 
@@ -86,8 +87,6 @@
         api_hash=os.getenv('TELEGRAM_API_HASH')
     )
 
-
-
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
@@ -120,17 +119,8 @@
             gw.set_cell(row, 'status', 'Archive in progress')
 
             url = expand_url(url)
-
-            if usefilenumber:
-                filenumber = gw.get_cell(row, 'filenumber')
-                logger.debug(f'filenumber is {filenumber}')
-                if filenumber == "":
-                    logger.warning(f'Logic error on row {row} with url {url} - the feature flag for usefilenumber is True, yet cant find a corresponding filenumber')
-                    gw.set_cell(row, 'status', 'Missing filenumber')
-                    continue
-            else:
-                # We will use this through the app to differentiate between where to save
-                filenumber = None
+
+            subfolder = gw.get_cell_or_default(row, 'subfolder')
 
             # make a new driver so each spreadsheet row is idempotent
             options = webdriver.FirefoxOptions()
@@ -142,7 +132,7 @@
             # in seconds, telegram screenshots catch which don't come back
             driver.set_page_load_timeout(120)
 
-            # client
+            # client
             storage_client = None
             if storage == "s3":
                 storage_client = s3_client
@@ -150,6 +140,7 @@
                 storage_client = gd_client
             else:
                 raise ValueError(f'Cannot get storage_client {storage_client}')
+            storage_client.update_properties(subfolder=subfolder)
 
             # order matters, first to succeed excludes remaining
             active_archivers = [
@@ -164,12 +155,12 @@
                 logger.debug(f'Trying {archiver} on row {row}')
 
                 try:
-                    if usefilenumber:
-                        # using filenumber to store in folders so not checking for existence of that url
-                        result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
-                    else:
-                        result = archiver.download(url, check_if_exists=True)
-
+                    result = archiver.download(url, check_if_exists=True)
+                except KeyboardInterrupt:
+                    logger.warning("caught interrupt")
+                    gw.set_cell(row, 'status', '')
+                    driver.quit()
+                    exit()
                 except Exception as e:
                     result = False
                     logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
@@ -180,9 +171,9 @@
                         result.status = archiver.name + \
                             ": " + str(result.status)
                         logger.success(
-                            f'{archiver} succeeded on row {row}, url {url}')
+                            f'{archiver} succeeded on row {row}, url {url}')
                         break
-
+
                     # wayback has seen this url before so keep existing status
                     if "wayback: Internet Archive fallback" in result.status:
                         logger.success(
@@ -203,6 +194,7 @@
             gw.set_cell(row, 'status', 'failed: no archiver')
 
     logger.success(f'Finished worksheet {wks.title}')
 
+
 @logger.catch
 def main():
     logger.debug(f'Passed args:{sys.argv}')
@@ -213,27 +205,21 @@
     parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
     parser.add_argument('--private', action='store_true', help='Store content without public access permission')
-    parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
-                        help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012')
-    parser.add_argument('--storage', action='store', dest='storage', default='s3', \
-                        help='s3 or gd storage. Default is s3. NOTE GD storage supports only using filenumber')
+    parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
 
     for k, v in GWorksheet.COLUMN_NAMES.items():
-        parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
+        help = f"the name of the column to fill with {k} (defaults={v})"
+        if k == "subfolder":
+            help = f"the name of the column to read the {k} from (defaults={v})"
+        parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
 
     args = parser.parse_args()
     config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
 
-    logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')
-
-    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
-    # args.filenumber is True (of type bool) when set or None when argument is not there
-    usefilenumber = False
-    if args.usefilenumber:
-        usefilenumber = True
+    logger.info(f'Opening document {args.sheet} for header {args.header} and storage {args.storage}')
 
     mkdir_if_not_exists('tmp')
-    process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns)
+    process_sheet(args.sheet, args.storage, args.header, config_columns)
     shutil.rmtree('tmp')
diff --git a/storages/base_storage.py b/storages/base_storage.py
index e1bf9c7..108e05f 100644
--- a/storages/base_storage.py
+++ b/storages/base_storage.py
@@ -1,5 +1,6 @@
 from loguru import logger
 from abc import ABC, abstractmethod
+from pathlib import Path
 
 
 class Storage(ABC):
@@ -19,3 +20,25 @@
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         with open(filename, 'rb') as f:
             self.uploadf(f, key, **kwargs)
+
+    def update_properties(self, **kwargs):
+        """
+        method used to update general properties that some children may use
+        and others not, but that all can call
+        """
+        for k, v in kwargs.items():
+            if k in self.get_allowed_properties():
+                setattr(self, k, v)
+            else:
+                logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
+
+    def get_allowed_properties(self):
+        """
+        child classes should specify which properties they allow to be set
+        """
+        return set(["subfolder"])
+
+    def clean_path(self, folder, default="", add_forward_slash=True):
+        if folder is None or type(folder) != str or len(folder.strip()) == 0:
+            return default
+        return str(Path(folder)) + ("/" if add_forward_slash else "")
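A small illustration of how the new `Storage` helpers behave (hypothetical values; `subfolder` is the only allowed dynamic property):

```python
storage_client.update_properties(subfolder='SM0005')  # accepted: in get_allowed_properties()
storage_client.update_properties(bucket='other')      # ignored with a warning: not allowed

storage_client.clean_path('SM0005')             # 'SM0005/'
storage_client.clean_path('   ', 'default')     # 'default' (blank input falls back to the default)
storage_client.clean_path('SM0005', '', False)  # 'SM0005' (no trailing slash)
```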
diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index 0e21dfa..3d65519 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -15,6 +15,7 @@ class GDConfig:
 
 
 class GDStorage(Storage):
+    DEFAULT_UPLOAD_FOLDER_NAME = "default"
 
     def __init__(self, config: GDConfig):
         self.root_folder_id = config.root_folder_id
@@ -22,19 +23,14 @@ class GDStorage(Storage):
         creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
         self.service = build('drive', 'v3', credentials=creds)
 
-    def _get_path(self, key):
-        return self.folder + key
-
     def get_cdn_url(self, key):
-        # only support files saved in a folders for GD
-        # S3 supports folder and all stored in the root
-
-        # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
-        foldername = key.split('/', 1)[0]
-        # eg twitter__media_asdf.jpg
-        filename = key.split('/', 1)[1]
-
-        logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
+        """
+        only support files saved in a folder for GD
+        S3 supports folder and all stored in the root
+        """
+        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
+        filename = key
+        logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
 
         # retry policy on Google Drive
         try_again = True
@@ -42,11 +38,11 @@ class GDStorage(Storage):
         counter = 1
         folder_id = None
         while try_again:
             # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
-            results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
-                                                and name = '{foldername}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
 
             for item in items:
@@ -55,11 +51,11 @@ class GDStorage(Storage):
                 try_again = False
 
             if folder_id is None:
-                logger.debug(f'Cant find {foldername=} waiting and trying again {counter=}')
+                logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
                 counter += 1
                 time.sleep(10)
                 if counter > 18:
-                    raise ValueError(f'Cant find {foldername} and retried 18 times pausing 10seconds at a time which is 3 minutes')
+                    raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
 
         # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html
         # happens doing thumbnails
@@ -71,12 +67,11 @@ class GDStorage(Storage):
             logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
 
             # get id of the sub folder
-            results = self.service.files().list(q=f"'{folder_id}' in parents \
-                                                and mimeType='application/vnd.google-apps.folder' \
-                                                and name = '{a}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
 
             filename = None
@@ -87,11 +82,11 @@ class GDStorage(Storage):
                 raise ValueError(f'Problem finding sub folder {a}')
 
         # get id of file inside folder (or sub folder)
-        results = self.service.files().list(q=f"'{folder_id}' in parents \
-                                            and name = '{filename}' ",
-                                            spaces='drive',
-                                            fields='files(id, name)'
-                                            ).execute()
+        results = self.service.files().list(
+            q=f"'{folder_id}' in parents and name = '{filename}' ",
+            spaces='drive',
+            fields='files(id, name)'
+        ).execute()
         items = results.get('files', [])
 
         file_id = None
@@ -110,41 +105,36 @@ class GDStorage(Storage):
         return False
 
     def uploadf(self, file, key, **_kwargs):
-        # split on first occurance of /
-        # eg SM0005
-        foldername = key.split('/', 1)[0]
-        # eg twitter__media_asdf.jpg
-        filename = key.split('/', 1)[1]
-
+        logger.debug(f"before {self.subfolder=}")
+        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
+        filename = key
+        logger.debug(f"after {self.subfolder=}")
         # does folder eg SM0005 exist already inside parent of Files auto-archiver
-        results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
-                                            and mimeType='application/vnd.google-apps.folder' \
-                                            and name = '{foldername}' ",
-                                            spaces='drive',
-                                            fields='files(id, name)'
-                                            ).execute()
+        results = self.service.files().list(
+            q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
+            spaces='drive',
+            fields='files(id, name)'
+        ).execute()
         items = results.get('files', [])
 
         folder_id_to_upload_to = None
         if len(items) > 1:
-            logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
+            logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
 
         for item in items:
             logger.debug(f"Found existing folder of {item['name']}")
             folder_id_to_upload_to = item['id']
 
         if folder_id_to_upload_to is None:
-            logger.debug(f'Creating new folder {foldername}')
+            logger.debug(f'Creating new folder {self.subfolder}')
             file_metadata = {
-                'name': [foldername],
+                'name': [self.subfolder],
                 'mimeType': 'application/vnd.google-apps.folder',
                 'parents': [self.root_folder_id]
             }
             gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
             folder_id_to_upload_to = gd_file.get('id')
 
-        # check for subfolder nema in file eg youtube_dl_sDE-qZdi8p8/out1.jpg'
-        # happens doing thumbnails
-
+        # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
         # will always return a and a blank b even if there is nothing to split
         # https://stackoverflow.com/a/38149500/26086
         a, _, b = filename.partition('/')
@@ -155,12 +145,11 @@ class GDStorage(Storage):
             logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
 
             # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
-            results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
-                                                and mimeType='application/vnd.google-apps.folder' \
-                                                and name = '{a}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
             sub_folder_id_to_upload_to = None
             if len(items) > 1:
@@ -184,17 +173,13 @@ class GDStorage(Storage):
             folder_id_to_upload_to = sub_folder_id_to_upload_to
             # back to normal control flow
 
-        # else:
         # upload file to gd
         file_metadata = {
-            # 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
             'name': [filename],
             'parents': [folder_id_to_upload_to]
         }
         media = MediaFileUpload(file, resumable=True)
-        gd_file = self.service.files().create(body=file_metadata,
-                                              media_body=media,
-                                              fields='id').execute()
+        gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
 
     def upload(self, filename: str, key: str, **kwargs):
         # GD only requires the filename not a file reader
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
index d7c9644..fd127e2 100644
--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@@ -2,6 +2,7 @@ import boto3
 from botocore.errorfactory import ClientError
 from .base_storage import Storage
 from dataclasses import dataclass
+from loguru import logger
 
 
 @dataclass
@@ -19,12 +20,9 @@ class S3Storage(Storage):
     def __init__(self, config: S3Config):
         self.bucket = config.bucket
         self.region = config.region
-        self.folder = config.folder
+        self.folder = self.clean_path(config.folder)
         self.private = config.private
 
-        if len(self.folder) and self.folder[-1] != '/':
-            self.folder += '/'
-
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
@@ -34,7 +32,7 @@ class S3Storage(Storage):
         )
 
     def _get_path(self, key):
-        return self.folder + key
+        return self.folder + self.clean_path(self.subfolder) + key
 
     def get_cdn_url(self, key):
         return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
@@ -47,9 +45,9 @@ class S3Storage(Storage):
             return False
 
     def uploadf(self, file, key, **kwargs):
+        logger.debug(f'[S3 storage] uploading {file=}, {key=}')
         if self.private:
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
-
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
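With `_get_path` as changed above, the final S3 object key composes the configured root folder, the per-row subfolder, and the file key; for instance (hypothetical folder values, reusing a filename from the doc's own comments):

```python
# given config.folder = 'my-archive' and a row whose subfolder cell is 'SM0005':
# self.folder == 'my-archive/' (cleaned in __init__)
# self.clean_path(self.subfolder) == 'SM0005/'
s3_storage._get_path('twitter__media_FM7-ggCUYAQHKWW.jpg')
# -> 'my-archive/SM0005/twitter__media_FM7-ggCUYAQHKWW.jpg'
```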
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 42afe04..403c453 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -9,8 +9,8 @@ class GWorksheet:
     eg: if header=4, row 5 will be the first with data.
     """
     COLUMN_NAMES = {
-        'filenumber': 'file number',
         'url': 'link',
+        'subfolder': 'sub folder',
         'archive': 'archive location',
         'date': 'archive date',
         'status': 'archive status',
@@ -69,6 +69,15 @@ class GWorksheet:
             return ''
         return row[col_index]
 
+    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
+        """
+        return self.get_cell or default value on error (eg: column is missing)
+        """
+        try:
+            return self.get_cell(row, col, fresh)
+        except:
+            return default
+
     def set_cell(self, row: int, col: str, val):
         # row is 1-based
         col_index = self._col_index(col) + 1
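Finally, `get_cell_or_default` is what lets `process_sheet` treat the subfolder column as optional; a usage sketch (hypothetical row value):

```python
# returns the cell value when the 'subfolder' column exists,
# and None (the default) when the column is missing from the sheet
subfolder = gw.get_cell_or_default(row, 'subfolder')
storage_client.update_properties(subfolder=subfolder)
```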