Merge branch 'dev' into refactor-configs

pull/33/head
msramalho 2022-06-02 17:30:47 +02:00
commit 10f03cb888
15 changed files with 512 additions and 43 deletions

View file

@ -7,4 +7,12 @@ INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=cookie: datr= xxxx
# Google Drive, Right click on folder, Get link:
# https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing
# we want: 123456789987654321abcdefghijk
# Remember to share the folder with the service email
# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
GD_ROOT_FOLDER_ID=
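
The folder ID is simply the last path segment of the share link; a minimal sketch (a hypothetical helper, not part of this repo) to extract it:

```python
from urllib.parse import urlparse

def folder_id_from_share_link(link: str) -> str:
    # https://drive.google.com/drive/folders/<ID>?usp=sharing -> <ID>
    return urlparse(link).path.rstrip("/").split("/")[-1]

print(folder_id_from_share_link(
    "https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing"))
# prints: 123456789987654321abcdefghijk
```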

View file

@ -17,8 +17,10 @@ selenium = "*"
snscrape = "*"
yt-dlp = "*"
telethon = "*"
[dev-packages]
google-api-python-client = "*"
google-auth-httplib2 = "*"
google-auth-oauthlib = "*"
oauth2client = "*"
[requires]
python_version = "3.9"

109
Pipfile.lock (generated)
View file

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e27ea0a6fdf6e588c14fbb90af45f784b9e55a9b986a3b50770490648ba96720"
"sha256": "25b858227d74cc232bba525d34dcf30f15d18d535a6e9c59555e85a0a2bd8c61"
},
"pipfile-spec": 6,
"requires": {
@ -141,18 +141,19 @@
},
"cachetools": {
"hashes": [
"sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6",
"sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4"
"sha256:4ebbd38701cdfd3603d1f751d851ed248ab4570929f2d8a7ce69e30c420b141c",
"sha256:8b3b8fa53f564762e5b221e9896798951e7f915513abf2ba072ce0f07f3f5a98"
],
"markers": "python_version ~= '3.7'",
"version": "==5.0.0"
"version": "==5.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"version": "==2021.10.8"
"markers": "python_version >= '3.6'",
"version": "==2022.5.18.1"
},
"cffi": {
"hashes": [
@ -277,11 +278,11 @@
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
"sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0"
"sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20",
"sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"
],
"markers": "python_version >= '3.7'",
"version": "==3.6.0"
"version": "==3.7.0"
},
"flask": {
"hashes": [
@ -298,6 +299,22 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"google-api-core": {
"hashes": [
"sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3",
"sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d"
],
"markers": "python_version >= '3.6'",
"version": "==2.8.0"
},
"google-api-python-client": {
"hashes": [
"sha256:4527f7b8518a795624ab68da412d55628f83b98c67dd6a5d6edf725454f8b30b",
"sha256:600c43d7eac6e3536fdcad1d14ba9ee503edf4c7db0bd827e791bbf03b9f1330"
],
"index": "pypi",
"version": "==2.48.0"
},
"google-auth": {
"hashes": [
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
@ -306,14 +323,30 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.6"
},
"google-auth-httplib2": {
"hashes": [
"sha256:31e49c36c6b5643b57e82617cb3e021e3e1d2df9da63af67252c02fa9c1f4a10",
"sha256:a07c39fd632becacd3f07718dfd6021bf396978f03ad3ce4321d060015cc30ac"
],
"index": "pypi",
"version": "==0.1.0"
},
"google-auth-oauthlib": {
"hashes": [
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
],
"markers": "python_version >= '3.6'",
"index": "pypi",
"version": "==0.5.1"
},
"googleapis-common-protos": {
"hashes": [
"sha256:6b5ee59dc646eb61a8eb65ee1db186d3df6687c8804830024f32573298bca19b",
"sha256:ddcd955b5bb6589368f659fa475373faa1ed7d09cde5ba25e88513d87007e174"
],
"markers": "python_version >= '3.6'",
"version": "==1.56.1"
},
"gspread": {
"hashes": [
"sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283",
@ -330,6 +363,14 @@
"markers": "python_version >= '3.6'",
"version": "==0.13.0"
},
"httplib2": {
"hashes": [
"sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585",
"sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.20.4"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
@ -499,6 +540,14 @@
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"oauth2client": {
"hashes": [
"sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac",
"sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6"
],
"index": "pypi",
"version": "==4.1.3"
},
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@ -515,6 +564,36 @@
"markers": "python_version >= '3.6'",
"version": "==1.1.0"
},
"protobuf": {
"hashes": [
"sha256:06059eb6953ff01e56a25cd02cca1a9649a75a7e65397b5b9b4e929ed71d10cf",
"sha256:097c5d8a9808302fb0da7e20edf0b8d4703274d140fd25c5edabddcde43e081f",
"sha256:284f86a6207c897542d7e956eb243a36bb8f9564c1742b253462386e96c6b78f",
"sha256:32ca378605b41fd180dfe4e14d3226386d8d1b002ab31c969c366549e66a2bb7",
"sha256:3cc797c9d15d7689ed507b165cd05913acb992d78b379f6014e013f9ecb20996",
"sha256:62f1b5c4cd6c5402b4e2d63804ba49a327e0c386c99b1675c8a0fefda23b2067",
"sha256:69ccfdf3657ba59569c64295b7d51325f91af586f8d5793b734260dfe2e94e2c",
"sha256:6f50601512a3d23625d8a85b1638d914a0970f17920ff39cec63aaef80a93fb7",
"sha256:7403941f6d0992d40161aa8bb23e12575637008a5a02283a930addc0508982f9",
"sha256:755f3aee41354ae395e104d62119cb223339a8f3276a0cd009ffabfcdd46bb0c",
"sha256:77053d28427a29987ca9caf7b72ccafee011257561259faba8dd308fda9a8739",
"sha256:7e371f10abe57cee5021797126c93479f59fccc9693dafd6bd5633ab67808a91",
"sha256:9016d01c91e8e625141d24ec1b20fed584703e527d28512aa8c8707f105a683c",
"sha256:9be73ad47579abc26c12024239d3540e6b765182a91dbc88e23658ab71767153",
"sha256:adc31566d027f45efe3f44eeb5b1f329da43891634d61c75a5944e9be6dd42c9",
"sha256:adfc6cf69c7f8c50fd24c793964eef18f0ac321315439d94945820612849c388",
"sha256:af0ebadc74e281a517141daad9d0f2c5d93ab78e9d455113719a45a49da9db4e",
"sha256:cb29edb9eab15742d791e1025dd7b6a8f6fcb53802ad2f6e3adcb102051063ab",
"sha256:cd68be2559e2a3b84f517fb029ee611546f7812b1fdd0aa2ecc9bc6ec0e4fdde",
"sha256:cdee09140e1cd184ba9324ec1df410e7147242b94b5f8b0c64fc89e38a8ba531",
"sha256:db977c4ca738dd9ce508557d4fce0f5aebd105e158c725beec86feb1f6bc20d8",
"sha256:dd5789b2948ca702c17027c84c2accb552fc30f4622a98ab5c51fcfe8c50d3e7",
"sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20",
"sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3"
],
"markers": "python_version >= '3.7'",
"version": "==3.20.1"
},
"pyaes": {
"hashes": [
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
@ -749,6 +828,14 @@
"markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"uritemplate": {
"hashes": [
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
"markers": "python_version >= '3.6'",
"version": "==4.1.1"
},
"urllib3": {
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",

View file

@ -1,6 +1,6 @@
# auto-archiver
This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
## Setup
@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
DO_SPACES_REGION=
@ -23,8 +23,14 @@ DO_SPACES_KEY=
DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=
GD_ROOT_FOLDER_ID=
```
`.example.env` is an example of this file.
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
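
The script reads these values with `os.getenv`, so they need to be exported or loaded from `.env`; a quick sanity check, assuming `python-dotenv` is available (an assumption, not something this README mandates):

```python
import os
from dotenv import load_dotenv  # assumption: python-dotenv is installed

load_dotenv(".env")
for var in ("DO_SPACES_REGION", "DO_SPACES_KEY", "DO_SPACES_SECRET",
            "TELEGRAM_API_ID", "GD_ROOT_FOLDER_ID"):
    print(var, "set" if os.getenv(var) else "MISSING")
```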
## Running
@ -93,4 +99,25 @@ graph TD
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
A(BaseStorage) -->|parent of| C(LocalStorage)
A(BaseStorage) -->|parent of| D(GoogleDriveStorage)
```
## Saving into Subfolders
You can add a column to the spreadsheet whose name is passed via the `--col-subfolder` argument; its value is handed to the storage and specifies a subfolder to put the archived media into (see the sketch below).
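
A small sketch of how a per-row subfolder value ends up in the stored key; it copies the `clean_path` helper added to `Storage` in this PR, and the example values are illustrative:

```python
from pathlib import Path

def clean_path(folder, default="", add_forward_slash=True):
    # copy of Storage.clean_path from this PR
    if folder is None or type(folder) != str or len(folder.strip()) == 0:
        return default
    return str(Path(folder)) + ("/" if add_forward_slash else "")

subfolder = "SM0005"                       # value read from the 'sub folder' column
key = "twitter__media_FM7-ggCUYAQHKWW.jpg"
print(clean_path(subfolder) + key)         # -> 'SM0005/twitter__media_FM7-ggCUYAQHKWW.jpg'
print(clean_path(None) + key)              # empty subfolder -> key stays at the root
```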
## Google Drive
To use Google Drive storage you need the ID of the shared folder in the `.env` file, and the folder must be shared with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
```bash
python auto_archive.py --sheet 'Sheet Name' --storage='gd'
```
## Telethon (Telegram's API Library)
Put your `anon.session` in the root so that the script doesn't stall and ask for authentication (see the sketch below for one way to generate it).
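
One way to create `anon.session` ahead of time is to run Telethon interactively once; a minimal sketch, assuming `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` are set as above:

```python
import os
from telethon.sync import TelegramClient

# Creates/refreshes anon.session in the working directory after an interactive login,
# so later runs of the archiver do not stall asking for a phone number and code.
with TelegramClient("anon", int(os.getenv("TELEGRAM_API_ID")), os.getenv("TELEGRAM_API_HASH")) as client:
    print("logged in as", client.get_me().username)
```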

View file

@ -14,6 +14,10 @@ from selenium.common.exceptions import TimeoutException
from storages import Storage
from utils import mkdir_if_not_exists
from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException
@dataclass
class ArchiveResult:
@ -47,6 +51,7 @@ class Archiver(ABC):
def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
@ -70,8 +75,11 @@ class Archiver(ABC):
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
# eg images in a tweet are saved to cloud storage
def generate_media_page(self, urls, url, object):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@ -87,12 +95,19 @@ class Archiver(ABC):
filename = Storage.TMP_FOLDER + key
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key)
hash = self.get_hash(filename)
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
@ -119,7 +134,11 @@ class Archiver(ABC):
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
# TODO: customizable hash
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
f.close()
return hash.hexdigest()
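
The `TODO` above suggests making the algorithm configurable; a sketch of one way to do that (a hypothetical helper, not part of this PR):

```python
import hashlib

def file_hash(filename: str, algorithm: str = "sha256") -> str:
    # hashlib.new accepts any name in hashlib.algorithms_available, eg "sha3_512"
    h = hashlib.new(algorithm)
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()
```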
@ -128,6 +147,19 @@ class Archiver(ABC):
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = Storage.TMP_FOLDER + key
# dismiss Facebook's accept-cookies popup so it doesn't appear in the screenshot
if 'facebook.com' in url:
try:
logger.debug(f'Trying fb click accept cookie popup for {url}')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug('fb click worked')
# linux server needs a sleep otherwise the facebook cookie won't have worked and we'll get a popup on the next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies for url {url}')
try:
self.driver.get(url)
time.sleep(6)
@ -137,6 +169,7 @@ class Archiver(ABC):
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
@ -167,10 +200,9 @@ class Archiver(ABC):
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:

View file

@ -6,6 +6,7 @@ from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from configs import TelegramConfig
@ -43,15 +44,21 @@ class TelethonArchiver(Archiver):
status = "success"
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start():
matches = list(matches[0])
chat, post_id = matches[1], matches[2]
post_id = int(post_id)
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
return False
except ChannelInvalidError as e:
# TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
logger.error(f'Could not fetch telegram {url}, possibly it\'s private or not displayable: {e}')
return False
media_posts = self._get_media_posts_in_group(chat, post)
@ -60,9 +67,10 @@ class TelethonArchiver(Archiver):
if len(media_posts) > 1:
key = self.get_html_key(url)
cdn_url = self.storage.get_cdn_url(key)
if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists; it is not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
status = 'already archived'
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)

View file

@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl:
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error: {error}')
return ArchiveResult(status=status)

View file

@ -1,6 +1,5 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
@ -10,13 +9,14 @@ class TwitterArchiver(Archiver):
name = "twitter"
def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url):
return False
tweet_id = urlparse(url).path.split('/')
if 'status' in tweet_id:
i = tweet_id.index('status')
tweet_id = tweet_id[i+1]
tweet_id = tweet_id[i + 1]
else:
return False
@ -24,11 +24,12 @@ class TwitterArchiver(Archiver):
try:
tweet = next(scr.get_items())
except:
logger.warning('wah wah')
except Exception as ex:
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return False
if tweet.media is None:
logger.trace(f'No media found')
return False
urls = []

View file

@ -5,6 +5,9 @@ from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig
from loguru import logger
class WaybackArchiver(Archiver):
name = "wayback"
@ -26,9 +29,11 @@ class WaybackArchiver(Archiver):
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']

View file

@ -7,6 +7,7 @@ from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
@ -27,13 +28,17 @@ class YoutubeDLArchiver(Archiver):
try:
info = ydl.extract_info(url, download=False)
except yt_dlp.utils.DownloadError:
# no video here
except yt_dlp.utils.DownloadError as e:
logger.debug(f'No video - Youtube normal control flow: {e}')
return False
except Exception as e:
logger.debug(f'ytdlp exception, which can be normal - eg a facebook page with images only will raise IndexError: list index out of range. Exception here is: \n {e}')
return False
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
logger.info('Found https://twitter.com/ in the download url from Twitter')
@ -41,7 +46,6 @@ class YoutubeDLArchiver(Archiver):
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
return False
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
@ -81,10 +85,12 @@ class YoutubeDLArchiver(Archiver):
if status != 'already archived':
key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(filename, key)
# filename ='tmp/sDE-qZdi8p8.webm'
# key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
cdn_url = self.storage.get_cdn_url(key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
@ -102,9 +108,9 @@ class YoutubeDLArchiver(Archiver):
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
if 'timestamp' in info else \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
if 'upload_date' in info and info['upload_date'] is not None else \
None
None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)

View file

@ -9,6 +9,11 @@ import traceback
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
import archivers
from storages import S3Storage, S3Config
from storages.gd_storage import GDConfig, GDStorage
from utils import GWorksheet, mkdir_if_not_exists
import sys
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
@ -56,6 +61,25 @@ def update_sheet(gw, row, result: ArchiveResult):
def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
sh = c.gsheets_client.open(sheet)
def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
gd_config = GDConfig(
root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
@ -75,16 +99,22 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
storage = c.get_storage()
gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
gd_client = GDStorage(gd_config)
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
subfolder = gw.get_cell_or_default(row, 'subfolder')
# make a new driver so each spreadsheet row is idempotent
c.recreate_webdriver()
@ -98,16 +128,35 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
storage_client = None
if storage == "s3":
storage_client = s3_client
elif storage == "gd":
storage_client = gd_client
else:
raise ValueError(f"Can't get a storage_client for storage type {storage}")
storage_client.update_properties(subfolder=subfolder)
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
try:
result = archiver.download(url, check_if_exists=True)
except KeyboardInterrupt:
logger.warning("caught interrupt")
gw.set_cell(row, 'status', '')
driver.quit()
exit()
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result:
# IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}, url {url}')
if result.status in ['success', 'already archived']:
result.status = f"{archiver.name}: {result.status}"
logger.success(f'{archiver} succeeded on row {row}')
@ -115,6 +164,21 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = f"{archiver.name}: {result.status}"
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning(
f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
driver.quit()
if result:
update_sheet(gw, row, result)
else:
@ -129,12 +193,25 @@ def main():
c.parse()
logger.info(f'Opening document {c.sheet} for header {c.header}')
parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to fill with {k} (defaults={v})"
if k == "subfolder":
help = f"the name of the column to read the {k} from (defaults={v})"
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
mkdir_if_not_exists(c.tmp_folder)
process_sheet(c, c.sheet, header=c.header, columns=c.column_names)
shutil.rmtree(c.tmp_folder)
c.destroy_webdriver()
logger.info(f'Opening document {args.sheet} for header {args.header}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet, header=args.header, columns=config_columns)
shutil.rmtree('tmp')
if __name__ == '__main__':
main()

View file

@ -1,5 +1,6 @@
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path
class Storage(ABC):
@ -20,3 +21,25 @@ class Storage(ABC):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
def update_properties(self, **kwargs):
"""
method used to update general properties that some children may use
and others not, but that all can call
"""
for k, v in kwargs.items():
if k in self.get_allowed_properties():
setattr(self, k, v)
else:
logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
def get_allowed_properties(self):
"""
child classes should specify which properties they allow to be set
"""
return set(["subfolder"])
def clean_path(self, folder, default="", add_forward_slash=True):
if folder is None or type(folder) != str or len(folder.strip()) == 0:
return default
return str(Path(folder)) + ("/" if add_forward_slash else "")
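
A short usage sketch of the new `update_properties` hook (the `MiniStorage` stand-in is hypothetical and only mirrors the filtering logic above):

```python
class MiniStorage:
    def get_allowed_properties(self):
        return {"subfolder"}

    def update_properties(self, **kwargs):
        # same filtering as Storage.update_properties: only allowed names are set
        for k, v in kwargs.items():
            if k in self.get_allowed_properties():
                setattr(self, k, v)
            else:
                print(f"does not accept dynamic property {k!r}")

s = MiniStorage()
s.update_properties(subfolder="SM0005")   # accepted, later used when building keys
s.update_properties(bucket="nope")        # rejected: not in the allowed set
print(s.subfolder)                        # -> SM0005
```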

View file

@ -0,0 +1,187 @@
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
import time
@dataclass
class GDConfig:
root_folder_id: str
class GDStorage(Storage):
DEFAULT_UPLOAD_FOLDER_NAME = "default"
def __init__(self, config: GDConfig):
self.root_folder_id = config.root_folder_id
SCOPES = ['https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
self.service = build('drive', 'v3', credentials=creds)
def get_cdn_url(self, key):
"""
GD only supports files saved inside a folder;
S3 supports both folders and files stored in the root
"""
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
# retry policy on Google Drive
try_again = True
counter = 1
folder_id = None
while try_again:
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
for item in items:
logger.debug(f"found folder of {item['name']}")
folder_id = item['id']
try_again = False
if folder_id is None:
logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
counter += 1
time.sleep(10)
if counter > 18:
raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
# happens doing thumbnails
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'index.html'
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
# get id of the sub folder
results = self.service.files().list(
q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
filename = None
for item in items:
folder_id = item['id']
filename = b
if filename is None:
raise ValueError(f'Problem finding sub folder {a}')
# get id of file inside folder (or sub folder)
results = self.service.files().list(
q=f"'{folder_id}' in parents and name = '{filename}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
file_id = None
for item in items:
logger.debug(f"found file of {item['name']}")
file_id = item['id']
if file_id is None:
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
return foo
def exists(self, _key):
# TODO: How to check for google drive, as it accepts different names
return False
def uploadf(self, file, key, **_kwargs):
logger.debug(f"before {self.subfolder=}")
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f"after {self.subfolder=}")
# does folder eg SM0005 exist already inside parent of Files auto-archiver
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
folder_id_to_upload_to = item['id']
if folder_id_to_upload_to is None:
logger.debug(f'Creating new folder {self.subfolder}')
file_metadata = {
'name': [self.subfolder],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [self.root_folder_id]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
folder_id_to_upload_to = gd_file.get('id')
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
# will always return a and a blank b even if there is nothing to split
# https://stackoverflow.com/a/38149500/26086
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'out1.jpg'
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
results = self.service.files().list(
q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
sub_folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {a} which should never happen')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
sub_folder_id_to_upload_to = item['id']
if sub_folder_id_to_upload_to is None:
# create new folder
file_metadata = {
'name': [a],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [folder_id_to_upload_to]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
sub_folder_id_to_upload_to = gd_file.get('id')
filename = b
folder_id_to_upload_to = sub_folder_id_to_upload_to
# back to normal control flow
# upload file to gd
file_metadata = {
'name': [filename],
'parents': [folder_id_to_upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
def upload(self, filename: str, key: str, **kwargs):
# GD only requires the filename not a file reader
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
self.uploadf(filename, key, **kwargs)

View file

@ -5,6 +5,8 @@ import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger
@dataclass
@ -26,20 +28,11 @@ class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = self.clean_path(config.folder)
self.private = config.private
self.cdn_url = config.cdn_url
self.key_path = config.key_path
if config.no_folder:
self.folder = ""
else:
self.folder = config.folder
if len(self.folder) and self.folder[-1] != '/':
self.folder += '/'
if self.key_path == "random":
self.key_dict = {} # key => randomKey
self.s3 = boto3.client(
's3',
region_name=config.region,
@ -62,6 +55,7 @@ class S3Storage(Storage):
self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
final_key = self.key_dict[key]
return self.folder + final_key
return self.folder + self.clean_path(self.subfolder) + key
def get_cdn_url(self, key):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
@ -74,9 +68,9 @@ class S3Storage(Storage):
return False
def uploadf(self, file, key, **kwargs):
logger.debug(f'[S3 storage] uploading {file=}, {key=}')
if self.private:
extra_args = kwargs.get("extra_args", {})
else:
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View file

@ -10,6 +10,7 @@ class GWorksheet:
"""
COLUMN_NAMES = {
'url': 'link',
'subfolder': 'sub folder',
'archive': 'archive location',
'date': 'archive date',
'status': 'archive status',
@ -71,6 +72,15 @@ class GWorksheet:
return ''
return row[col_index]
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
"""
return self.get_cell or default value on error (eg: column is missing)
"""
try:
return self.get_cell(row, col, fresh)
except:
return default
def set_cell(self, row: int, col: str, val):
# row is 1-based
col_index = self._col_index(col) + 1