diff --git a/.example.env b/.example.env
index 4a200cf..94d25af 100644
--- a/.example.env
+++ b/.example.env
@@ -7,4 +7,12 @@
 INTERNET_ARCHIVE_S3_SECRET=
 TELEGRAM_API_ID=
 TELEGRAM_API_HASH=
-FACEBOOK_COOKIE=cookie: datr= xxxx
\ No newline at end of file
+FACEBOOK_COOKIE=cookie: datr= xxxx
+
+# Google Drive: right click on the folder, Get link, eg
+# https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
+# we want: 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
+GD_ROOT_FOLDER_ID=
+
+# Remember to share the folder with the service account, eg
+# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
diff --git a/.gitignore b/.gitignore
index 9d83858..f76014b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ __pycache__/
 anu.html
 *.log
 .pytest_cach
-anon*
\ No newline at end of file
+
+anon*
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..1a82e0c
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,27 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Test Hashing",
+            "type": "python",
+            "request": "launch",
+            "program": "auto_archive.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            // "args": ["--sheet","Test Hashing"]
+            // "args": ["--sheet","Test Hashing","--use-filenumber-as-directory"]
+            "args": ["--sheet","Test Hashing","--use-filenumber-as-directory", "--storage=gd"]
+        },
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true
+        }
+    ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 5854f68..0f1b2e1 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # auto-archiver
 
-This Python script will look for links to Youtube, Twitter, etc,. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
+This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
 
 ## Setup
 
@@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
 
 [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
 
-A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
+A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive.
This file should also be in the script directory, and should contain the following variables:
 
 ```
 DO_SPACES_REGION=
@@ -23,8 +23,14 @@ DO_SPACES_KEY=
 DO_SPACES_SECRET=
 INTERNET_ARCHIVE_S3_KEY=
 INTERNET_ARCHIVE_S3_SECRET=
+TELEGRAM_API_ID=
+TELEGRAM_API_HASH=
+FACEBOOK_COOKIE=
+GD_ROOT_FOLDER_ID=
 ```
 
+`.example.env` is an example of this file.
+
 Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
 
 ## Running
@@ -93,3 +99,29 @@ graph TD
 graph TD
   A(BaseStorage) -->|parent of| B(S3Storage)
 ```
+
+## Saving into Folders
+
+To use a spreadsheet column called `File Number` (eg SM001234) as a directory on the cloud storage, pass:
+
+```bash
+python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
+```
+
+## Google Drive
+
+To use Google Drive storage, put the id of the shared folder in the `.env` file (`GD_ROOT_FOLDER_ID`) and share that folder with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`:
+
+```bash
+python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
+```
+
+Note that you must use filenumber for Google Drive storage.
+
+## Telethon (Telegram's API library)
+
+Put your `anon.session` in the root so that it doesn't stall and ask for authentication.
+
+
+
+
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 7ab5a9c..367b483 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -14,6 +14,9 @@
 from selenium.common.exceptions import TimeoutException
 from storages import Storage
 from utils import mkdir_if_not_exists
+from selenium.webdriver.common.by import By
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
 
 @dataclass
 class ArchiveResult:
@@ -39,7 +42,7 @@ class Archiver(ABC):
         return self.__class__.__name__
 
     @abstractmethod
-    def download(self, url, check_if_exists=False): pass
+    def download(self, url, check_if_exists=False, filenumber=None): pass
 
     def get_netloc(self, url):
         return urlparse(url).netloc
@@ -47,7 +50,8 @@
     def get_html_key(self, url):
         return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
 
-    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
+    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
+    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
         page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>

             <body>
             <h2>Archived media from {self.name}</h2>
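For reference, the convention this hunk introduces (and which the archivers below repeat) is to prefix the storage key with the spreadsheet file number when one is supplied, so objects land in a per-record folder such as `SM3013/`. A minimal sketch of that convention, with `prefixed_key` being a hypothetical helper used only for illustration:

```python
from typing import Optional

def prefixed_key(key: str, filenumber: Optional[str]) -> str:
    # e.g. key = "twitter__minmyatnaing13_status_1499415562937503751.html"
    # with filenumber "SM3013" the object is stored as "SM3013/twitter__...html"
    return key if filenumber is None else f"{filenumber}/{key}"
```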
@@ -61,18 +65,24 @@ class Archiver(ABC): page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") page_filename = 'tmp/' + page_key - page_cdn = self.storage.get_cdn_url(page_key) with open(page_filename, "w") as f: f.write(page) page_hash = self.get_hash(page_filename) + if filenumber != None: + logger.trace(f'filenumber for directory is {filenumber}') + page_key = filenumber + "/" + page_key + self.storage.upload(page_filename, page_key, extra_args={ 'ACL': 'public-read', 'ContentType': 'text/html'}) + + page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) - def generate_media_page(self, urls, url, object): + # eg images in a tweet save to cloud storage + def generate_media_page(self, urls, url, object, filenumber=None): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } @@ -87,19 +97,30 @@ class Archiver(ABC): filename = 'tmp/' + key + # eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) + if filenumber is not None: + logger.debug(f'filenumber for directory is {filenumber}') + key = filenumber + "/" + key + + # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg' + # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg' + # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg' self.storage.upload(filename, key) + hash = self.get_hash(filename) + + # eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg' cdn_url = self.storage.get_cdn_url(key) if thumbnail is None: thumbnail = cdn_url uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail) + return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber) def get_key(self, filename): """ @@ -119,15 +140,33 @@ class Archiver(ABC): def get_hash(self, filename): f = open(filename, "rb") bytes = f.read() # read entire file as bytes + hash = hashlib.sha256(bytes) + # option to use SHA3_512 instead + # hash = hashlib.sha3_512(bytes) f.close() return hash.hexdigest() - def get_screenshot(self, url): + # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png + # def get_screenshot(self, url, filenumber, storage="GD"): + def get_screenshot(self, url, filenumber): key = self.get_key(urlparse(url).path.replace( "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") filename = 'tmp/' + key + # Accept cookies popup dismiss for ytdlp video + if 'facebook.com' in url: + try: + logger.debug(f'Trying fb click accept cookie popup for {url}') + self.driver.get("http://www.facebook.com") + foo = self.driver.find_element(By.XPATH,"//button[@data-cookiebanner='accept_only_essential_button']") + foo.click() + logger.debug(f'fb click worked') + # linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page + time.sleep(2) + except: + logger.warning(f'Failed on fb accept cookies for url {url}') + try: self.driver.get(url) time.sleep(6) @@ -135,8 +174,14 @@ class Archiver(ABC): logger.info("TimeoutException loading page for screenshot") self.driver.save_screenshot(filename) + + if filenumber is not None: + logger.debug(f'filenumber for directory is {filenumber}') + key = filenumber + "/" + 
key + self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'image/png'}) + return self.storage.get_cdn_url(key) def get_thumbnails(self, filename, key, duration=None): @@ -167,10 +212,9 @@ class Archiver(ABC): thumbnail_filename = thumbnails_folder + fname key = key_folder + fname - cdn_url = self.storage.get_cdn_url(key) - self.storage.upload(thumbnail_filename, key) + cdn_url = self.storage.get_cdn_url(key) cdn_urls.append(cdn_url) if len(cdn_urls) == 0: diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5a7f63c..b19ab8f 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult class TelegramArchiver(Archiver): name = "telegram" - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): # detect URLs that we definitely cannot handle if 't.me' != self.get_netloc(url): return False @@ -27,7 +27,7 @@ class TelegramArchiver(Archiver): if url[-8:] != "?embed=1": url += "?embed=1" - screenshot = self.get_screenshot(url) + screenshot = self.get_screenshot(url, filenumber=filenumber) t = requests.get(url, headers=headers) s = BeautifulSoup(t.content, 'html.parser') @@ -42,7 +42,7 @@ class TelegramArchiver(Archiver): urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])] images += urls - page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) + page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber) time_elements = s.find_all('time') timestamp = time_elements[0].get('datetime') if len(time_elements) else None @@ -52,6 +52,9 @@ class TelegramArchiver(Archiver): video_id = video_url.split('/')[-1].split('?')[0] key = self.get_key(video_id) + if filenumber is not None: + key = filenumber + "/" + key + filename = 'tmp/' + key cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 88bec58..5cee791 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -41,20 +41,22 @@ class TelethonArchiver(Archiver): media.append(post) return media - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): # detect URLs that we definitely cannot handle matches = self.link_pattern.findall(url) if not len(matches): return False status = "success" - screenshot = self.get_screenshot(url) + screenshot = self.get_screenshot(url, filenumber) + # app will ask (stall for user input!) 
for phone number and auth code if anon.session not found with self.client.start(): matches = list(matches[0]) chat, post_id = matches[1], matches[2] post_id = int(post_id) + try: post = self.client.get_messages(chat, ids=post_id) except ValueError as e: @@ -65,9 +67,13 @@ class TelethonArchiver(Archiver): if len(media_posts) > 1: key = self.get_html_key(url) - cdn_url = self.storage.get_cdn_url(key) + + if filenumber is not None: + key = filenumber + "/" + key if check_if_exists and self.storage.exists(key): + # only s3 storage supports storage.exists as not implemented on gd + cdn_url = self.storage.get_cdn_url(key) status = 'already archived' return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) @@ -78,19 +84,26 @@ class TelethonArchiver(Archiver): if len(mp.message) > len(message): message = mp.message filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') key = filename.split('tmp/')[1] + + if filenumber is not None: + key = filenumber + "/" + key self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) os.remove(filename) - page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) + page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = self.get_key(f'{chat}_{post_id}') filename = self.client.download_media(post.media, f'tmp/{key}') key = filename.split('tmp/')[1].replace(" ", "") + + if filenumber is not None: + key = filenumber + "/" + key + self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) @@ -99,5 +112,5 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) - page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) + page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 6b5116f..9b90efa 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult class TiktokArchiver(Archiver): name = "tiktok" - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): if 'tiktok.com' not in url: return False @@ -54,11 +54,13 @@ class TiktokArchiver(Archiver): thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(), hash=hash, screenshot=screenshot) - except tiktok_downloader.Except.InvalidUrl: + except tiktok_downloader.Except.InvalidUrl as e: status = 'Invalid URL' + logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}') return ArchiveResult(status=status) except: error = traceback.format_exc() status = 'Other Tiktok error: ' + str(error) + logger.warning(f'Other Tiktok error' + str(error)) 
return ArchiveResult(status=status) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 099d279..05e7ec0 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -1,6 +1,5 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from loguru import logger -import requests from urllib.parse import urlparse from .base_archiver import Archiver, ArchiveResult @@ -9,7 +8,8 @@ from .base_archiver import Archiver, ArchiveResult class TwitterArchiver(Archiver): name = "twitter" - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): + if 'twitter.com' != self.get_netloc(url): return False @@ -24,11 +24,14 @@ class TwitterArchiver(Archiver): try: tweet = next(scr.get_items()) - except: - logger.warning('wah wah') + except Exception as ex: + template = "TwitterArchiver cant get tweet and threw, which can happen if a media sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.warning(message) return False if tweet.media is None: + logger.trace(f'No media found') return False urls = [] @@ -45,8 +48,8 @@ class TwitterArchiver(Archiver): else: logger.warning(f"Could not get media URL of {media}") - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json()) + page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber) - screenshot = self.get_screenshot(url) + screenshot = self.get_screenshot(url, filenumber) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 1fa98aa..652798a 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -4,6 +4,8 @@ from bs4 import BeautifulSoup from storages import Storage from .base_archiver import Archiver, ArchiveResult +from loguru import logger + class WaybackArchiver(Archiver): name = "wayback" @@ -12,7 +14,7 @@ class WaybackArchiver(Archiver): super(WaybackArchiver, self).__init__(storage, driver) self.seen_urls = {} - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): if check_if_exists and url in self.seen_urls: return self.seen_urls[url] @@ -25,9 +27,11 @@ class WaybackArchiver(Archiver): 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) if r.status_code != 200: + logger.warning(f"Internet archive failed with status of {r.status_code}") return ArchiveResult(status="Internet archive failed") if 'job_id' not in r.json() and 'message' in r.json(): + logger.warning(f"Internet archive failed json \n {r.json()}") return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") job_id = r.json()['job_id'] @@ -71,7 +75,7 @@ class WaybackArchiver(Archiver): except: title = "Could not get title" - screenshot = self.get_screenshot(url) + screenshot = self.get_screenshot(url, filenumber) result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) self.seen_urls[url] = result return result diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index ad8756b..9983950 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -15,7 +15,7 @@ class YoutubeDLArchiver(Archiver): super().__init__(storage, driver) self.fb_cookie = 
fb_cookie - def download(self, url, check_if_exists=False): + def download(self, url, check_if_exists=False, filenumber=None): netloc = self.get_netloc(url) if netloc in ['facebook.com', 'www.facebook.com']: logger.debug('Using Facebook cookie') @@ -27,13 +27,17 @@ class YoutubeDLArchiver(Archiver): try: info = ydl.extract_info(url, download=False) - except yt_dlp.utils.DownloadError: - # no video here + except yt_dlp.utils.DownloadError as e: + logger.debug(f'No video - Youtube normal control flow: {e}') + return False + except Exception as e: + logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}') return False if info.get('is_live', False): logger.warning("Live streaming media, not archiving now") return ArchiveResult(status="Streaming media") + if 'twitter.com' in netloc: if 'https://twitter.com/' in info['webpage_url']: logger.info('Found https://twitter.com/ in the download url from Twitter') @@ -41,7 +45,6 @@ class YoutubeDLArchiver(Archiver): logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet') return False - if check_if_exists: if 'entries' in info: if len(info['entries']) > 1: @@ -58,6 +61,9 @@ class YoutubeDLArchiver(Archiver): key = self.get_key(filename) + if filenumber is not None: + key = filenumber + "/" + key + if self.storage.exists(key): status = 'already archived' cdn_url = self.storage.get_cdn_url(key) @@ -81,12 +87,19 @@ class YoutubeDLArchiver(Archiver): if status != 'already archived': key = self.get_key(filename) - cdn_url = self.storage.get_cdn_url(key) + + if filenumber is not None: + key = filenumber + "/" + key self.storage.upload(filename, key) + # filename ='tmp/sDE-qZdi8p8.webm' + # key ='SM0022/youtube_dl_sDE-qZdi8p8.webm' + cdn_url = self.storage.get_cdn_url(key) + hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) + screenshot = self.get_screenshot(url, filenumber) + # get duration duration = info.get('duration') diff --git a/auto_archive.py b/auto_archive.py index c0ae085..a3c17d1 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -68,7 +68,7 @@ def expand_url(url): return url -def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): +def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES): gc = gspread.service_account(filename='service_account.json') sh = gc.open(sheet) @@ -78,6 +78,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): key=os.getenv('DO_SPACES_KEY'), secret=os.getenv('DO_SPACES_SECRET') ) + gd_config = GDConfig( + root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'), + ) telegram_config = archivers.TelegramConfig( api_id=os.getenv('TELEGRAM_API_ID'), api_hash=os.getenv('TELEGRAM_API_HASH') @@ -91,12 +94,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): gw = GWorksheet(wks, header_row=header, columns=columns) if not gw.col_exists('url'): - logger.warning( + logger.info( f'No "{columns["url"]}" column found, skipping worksheet {wks.title}') continue if not gw.col_exists('status'): - logger.warning( + logger.info( f'No "{columns["status"]}" column found, skipping worksheet {wks.title}') continue @@ -104,26 +107,30 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' s3_client = S3Storage(s3_config) - # order matters, first to succeed 
excludes remaining - active_archivers = [ - archivers.TelethonArchiver(s3_client, driver, telegram_config), - archivers.TelegramArchiver(s3_client, driver), - archivers.TiktokArchiver(s3_client, driver), - archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')), - archivers.TwitterArchiver(s3_client, driver), - archivers.WaybackArchiver(s3_client, driver) - ] + gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' + gd_client = GDStorage(gd_config) # loop through rows in worksheet for row in range(1 + header, gw.count_rows() + 1): url = gw.get_cell(row, 'url') original_status = gw.get_cell(row, 'status') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') + if url != '' and status in ['', None]: gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) + if usefilenumber: + filenumber = gw.get_cell(row, 'filenumber') + logger.debug(f'filenumber is {filenumber}') + if filenumber == "": + logger.warning(f'Logic error on row {row} with url {url} - the feature flag for usefilenumber is True, yet cant find a corresponding filenumber') + gw.set_cell(row, 'status', 'Missing filenumber') + continue + else: + # We will use this through the app to differentiate between where to save + filenumber = None # make a new driver so each spreadsheet row is idempotent options = webdriver.FirefoxOptions() @@ -134,24 +141,58 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): driver.set_window_size(1400, 2000) # in seconds, telegram screenshots catch which don't come back driver.set_page_load_timeout(120) + + # client + storage_client = None + if storage == "s3": + storage_client = s3_client + elif storage == "gd": + storage_client = gd_client + else: + raise ValueError(f'Cant get storage_client {storage_client}') + + # order matters, first to succeed excludes remaining + active_archivers = [ + archivers.TelethonArchiver(storage_client, driver, telegram_config), + archivers.TelegramArchiver(storage_client, driver), + archivers.TiktokArchiver(storage_client, driver), + archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')), + archivers.TwitterArchiver(storage_client, driver), + archivers.WaybackArchiver(storage_client, driver) + ] for archiver in active_archivers: logger.debug(f'Trying {archiver} on row {row}') try: - result = archiver.download(url, check_if_exists=True) + if usefilenumber: + # using filenumber to store in folders so not checking for existence of that url + result = archiver.download(url, check_if_exists=False, filenumber=filenumber) + else: + result = archiver.download(url, check_if_exists=True) + except Exception as e: result = False logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}') if result: - if result.status in ['success', 'already archived']: + # IA is a Success I believe - or do we want to display a logger warning for it? 
+ if result.status in ['success', 'already archived', 'Internet Archive fallback']: result.status = archiver.name + \ ": " + str(result.status) logger.success( - f'{archiver} succeeded on row {row}') + f'{archiver} succeeded on row {row}, url {url}') break + + # wayback has seen this url before so keep existing status + if "wayback: Internet Archive fallback" in result.status: + logger.success( + f'wayback has seen this url before so keep existing status on row {row}') + result.status = result.status.replace(' (duplicate)', '') + result.status = str(result.status) + " (duplicate)" + break + logger.warning( - f'{archiver} did not succeed on row {row}, final status: {result.status}') + f'{archiver} did not succeed on {row=}, final status: {result.status}') result.status = archiver.name + \ ": " + str(result.status) # get rid of driver so can reload on next row @@ -165,22 +206,34 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): @logger.catch def main(): logger.debug(f'Passed args:{sys.argv}') + parser = argparse.ArgumentParser( description='Automatically archive social media videos from a Google Sheets document') parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True) parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row') parser.add_argument('--private', action='store_true', help='Store content without public access permission') + parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \ + help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012') + parser.add_argument('--storage', action='store', dest='storage', default='s3', \ + help='s3 or gd storage. Default is s3. 
NOTE GD storage supports only using filenumber') + for k, v in GWorksheet.COLUMN_NAMES.items(): parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})') args = parser.parse_args() config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()} - logger.info(f'Opening document {args.sheet} for header {args.header}') + logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}') + + # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + # args.filenumber is True (of type bool) when set or None when argument is not there + usefilenumber = False + if args.usefilenumber: + usefilenumber = True mkdir_if_not_exists('tmp') - process_sheet(args.sheet, header=args.header, columns=config_columns) + process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns) shutil.rmtree('tmp') diff --git a/storages/base_storage.py b/storages/base_storage.py index e1bf9c7..79a555b 100644 --- a/storages/base_storage.py +++ b/storages/base_storage.py @@ -17,5 +17,12 @@ class Storage(ABC): def upload(self, filename: str, key: str, **kwargs): logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') - with open(filename, 'rb') as f: - self.uploadf(f, key, **kwargs) + # S3 requires an open file, GD only the filename + storage = type(self).__name__ + if storage == "GDStorage": + self.uploadf(filename, key, **kwargs) + elif storage == "S3Storage": + with open(filename, 'rb') as f: + self.uploadf(f, key, **kwargs) + else: + raise ValueError('Cant get storage thrown from base_storage.py') \ No newline at end of file diff --git a/storages/gd_storage.py b/storages/gd_storage.py index e69de29..4dab7d0 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -0,0 +1,202 @@ +from loguru import logger +from .base_storage import Storage +from dataclasses import dataclass + +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload +from google.oauth2 import service_account + +import time + +@dataclass +class GDConfig: + root_folder_id: str + +class GDStorage(Storage): + + def __init__(self, config: GDConfig): + self.root_folder_id = config.root_folder_id + SCOPES = ['https://www.googleapis.com/auth/drive'] + creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES) + self.service = build('drive', 'v3', credentials=creds) + + def _get_path(self, key): + return self.folder + key + + def get_cdn_url(self, key): + # only support files saved in a folders for GD + # S3 supports folder and all stored in the root + + # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg + foldername = key.split('/', 1)[0] + # eg twitter__media_asdf.jpg + filename = key.split('/', 1)[1] + + logger.debug(f'Looking for {foldername} and filename: {filename} on GD') + + # retry policy on Google Drive + try_again = True + counter = 1 + folder_id = None + while try_again: + # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url + results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \ + and name = '{foldername}' ", + spaces='drive', # ie not appDataFolder or photos + fields='files(id, name)' + ).execute() + items = results.get('files', []) + + for item in items: + logger.debug(f"found folder of {item['name']}") + folder_id= item['id'] + try_again = False + + 
if folder_id is None: + logger.debug(f'Cant find {foldername=} waiting and trying again {counter=}') + counter += 1 + time.sleep(10) + if counter > 18: + raise ValueError(f'Cant find {foldername} and retried 18 times pausing 10seconds at a time which is 3 minutes') + + # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html' + # happens doing thumbnails + a, _, b = filename.partition('/') + + if b != '': + # a: 'youtube_dl_sDE-qZdi8p8' + # b: 'index.html' + logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}') + + # get id of the sub folder + results = self.service.files().list(q=f"'{folder_id}' in parents \ + and mimeType='application/vnd.google-apps.folder' \ + and name = '{a}' ", + spaces='drive', # ie not appDataFolder or photos + fields='files(id, name)' + ).execute() + items = results.get('files', []) + + filename = None + for item in items: + folder_id = item['id'] + filename = b + if filename is None: + raise ValueError(f'Problem finding sub folder {a}') + + # get id of file inside folder (or sub folder) + results = self.service.files().list(q=f"'{folder_id}' in parents \ + and name = '{filename}' ", + spaces='drive', + fields='files(id, name)' + ).execute() + items = results.get('files', []) + + file_id = None + for item in items: + logger.debug(f"found file of {item['name']}") + file_id= item['id'] + + if file_id is None: + raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}') + + foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing" + return foo + + def exists(self, key): + # Not implemented yet + # Google drive will accept duplicate named filenames as it is stored as a different fileid + + # try: + # self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key)) + # return True + # except ClientError: + # return False + return False + + def uploadf(self, file, key, **kwargs): + # split on first occurance of / + # eg SM0005 + foldername = key.split('/', 1)[0] + # eg twitter__media_asdf.jpg + filename = key.split('/', 1)[1] + + # does folder eg SM0005 exist already inside parent of Files auto-archiver + results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \ + and mimeType='application/vnd.google-apps.folder' \ + and name = '{foldername}' ", + spaces='drive', + fields='files(id, name)' + ).execute() + items = results.get('files', []) + folder_id_to_upload_to = None + if len(items) > 1: + logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway') + + for item in items: + logger.debug(f"Found existing folder of {item['name']}") + folder_id_to_upload_to = item['id'] + + if folder_id_to_upload_to is None: + logger.debug(f'Creating new folder {foldername}') + file_metadata = { + 'name': [foldername], + 'mimeType': 'application/vnd.google-apps.folder', + 'parents': [self.root_folder_id] + } + gd_file = self.service.files().create(body=file_metadata, fields='id').execute() + folder_id_to_upload_to = gd_file.get('id') + + # check for subfolder nema in file eg youtube_dl_sDE-qZdi8p8/out1.jpg' + # happens doing thumbnails + + # will always return a and a blank b even if there is nothing to split + # https://stackoverflow.com/a/38149500/26086 + a, _, b = filename.partition('/') + + if b != '': + # a: 'youtube_dl_sDE-qZdi8p8' + # b: 'out1.jpg' + logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}') + + # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005 + results = 
self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \ + and mimeType='application/vnd.google-apps.folder' \ + and name = '{a}' ", + spaces='drive', # ie not appDataFolder or photos + fields='files(id, name)' + ).execute() + items = results.get('files', []) + sub_folder_id_to_upload_to = None + if len(items) > 1: + logger.error(f'Duplicate folder name of {a} which should never happen') + + for item in items: + logger.debug(f"Found existing folder of {item['name']}") + sub_folder_id_to_upload_to = item['id'] + + if sub_folder_id_to_upload_to is None: + # create new folder + file_metadata = { + 'name': [a], + 'mimeType': 'application/vnd.google-apps.folder', + 'parents': [folder_id_to_upload_to] + } + gd_file = self.service.files().create(body=file_metadata, fields='id').execute() + sub_folder_id_to_upload_to = gd_file.get('id') + + filename = b + folder_id_to_upload_to = sub_folder_id_to_upload_to + # back to normal control flow + + # else: + # upload file to gd + file_metadata = { + # 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg', + 'name': [filename], + 'parents': [folder_id_to_upload_to] + } + media = MediaFileUpload(file, resumable=True) + gd_file = self.service.files().create(body=file_metadata, + media_body=media, + fields='id').execute() diff --git a/utils/gworksheet.py b/utils/gworksheet.py index 6dec9b2..42afe04 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -9,6 +9,7 @@ class GWorksheet: eg: if header=4, row 5 will be the first with data. """ COLUMN_NAMES = { + 'filenumber': 'file number', 'url': 'link', 'archive': 'archive location', 'date': 'archive date',
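As a usage sketch (not part of the diff), this is roughly how the new Google Drive storage fits together, assuming `service_account.json` and `GD_ROOT_FOLDER_ID` are configured as described in the README changes above; the key follows the `<filenumber>/<filename>` convention so `uploadf` and `get_cdn_url` can resolve the folder:

```python
import os
from storages.gd_storage import GDConfig, GDStorage

gd_config = GDConfig(root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'))
gd_client = GDStorage(gd_config)

# upload() (from the base Storage class) hands the plain filename to uploadf() for
# GDStorage; the "SM0005" prefix becomes a folder under the shared root folder.
gd_client.upload('tmp/twitter__media_asdf.jpg', 'SM0005/twitter__media_asdf.jpg')

# get_cdn_url() looks the uploaded file back up and returns a shareable drive.google.com link.
print(gd_client.get_cdn_url('SM0005/twitter__media_asdf.jpg'))
```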