diff --git a/.example.env b/.example.env
index 4a200cf..94d25af 100644
--- a/.example.env
+++ b/.example.env
@@ -7,4 +7,12 @@ INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
-FACEBOOK_COOKIE=cookie: datr= xxxx
\ No newline at end of file
+FACEBOOK_COOKIE=cookie: datr= xxxx
+
+# Google Drive: right-click the folder, choose "Get link", e.g.
+# https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
+# the folder id we want is: 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
+GD_ROOT_FOLDER_ID=
+
+# Remember to share the folder with the service account, e.g.
+# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
diff --git a/.gitignore b/.gitignore
index 9d83858..f76014b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ __pycache__/
anu.html
*.log
.pytest_cach
-anon*
\ No newline at end of file
+
+anon*
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..1a82e0c
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,27 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Test Hashing",
+ "type": "python",
+ "request": "launch",
+ "program": "auto_archive.py",
+ "console": "integratedTerminal",
+ "justMyCode": true,
+ // "args": ["--sheet","Test Hashing"]
+ // "args": ["--sheet","Test Hashing","--use-filenumber-as-directory"]
+ "args": ["--sheet","Test Hashing","--use-filenumber-as-directory", "--storage=gd"]
+ },
+ {
+ "name": "Python: Current File",
+ "type": "python",
+ "request": "launch",
+ "program": "${file}",
+ "console": "integratedTerminal",
+ "justMyCode": true
+ }
+ ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 5854f68..0f1b2e1 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# auto-archiver
-This Python script will look for links to Youtube, Twitter, etc,. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
+This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
## Setup
@@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
-A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
+A `.env` file is required for saving content to a Digital Ocean space or Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
DO_SPACES_REGION=
@@ -23,8 +23,14 @@ DO_SPACES_KEY=
DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
+TELEGRAM_API_ID=
+TELEGRAM_API_HASH=
+FACEBOOK_COOKIE=
+GD_ROOT_FOLDER_ID=
```
+`.example.env` is an example of this file.
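+
+You can copy it to get started, e.g.:
+
+```bash
+cp .example.env .env
+```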
+
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
## Running
@@ -93,3 +99,29 @@ graph TD
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
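+%% GDStorage (added in this change) also extends BaseStorage
+A(BaseStorage) -->|parent of| C(GDStorage)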
```
+
+## Saving into Folders
+
+To use a column from the spreadsheet called `File Number` (e.g. `SM001234`) as a directory on the cloud storage, pass:
+
+```bash
+python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
+```
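+
+With a `File Number` of `SM001234`, uploads end up under that folder; illustrative keys (file names taken from the examples in the code comments):
+
+```
+SM001234/twitter__media_FM7-ggCUYAQHKWW.jpg
+SM001234/twitter__minmyatnaing13_status_1499415562937503751.html
+```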
+
+## Google Drive
+
+To use Google Drive storage you need the id of the shared folder in the `.env` file (`GD_ROOT_FOLDER_ID`), and the folder must be shared with the service account, e.g. `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`.
+
+```bash
+python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
+```
+
+Note that you must use `--use-filenumber-as-directory` with Google Drive storage.
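+
+For example, with the folder link shown in `.example.env`, the relevant `.env` entry would be (illustrative id):
+
+```
+GD_ROOT_FOLDER_ID=1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
+```
+
+The Google Drive client authenticates with the same `service_account.json` already used for Google Sheets access.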
+
+## Telethon (Telegram's API Library)
+
+Put your `anon.session` in the root, so that the script doesn't stall asking for authentication.
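+
+A minimal sketch (assuming `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` are already set in your environment) to create `anon.session` interactively once, before running the archiver unattended:
+
+```python
+# one-off helper (hypothetical, not part of the repo): creates anon.session in the current directory
+import os
+from telethon.sync import TelegramClient
+
+with TelegramClient('anon', int(os.getenv('TELEGRAM_API_ID')), os.getenv('TELEGRAM_API_HASH')) as client:
+    # the first run prompts for a phone number and auth code, then caches the session on disk
+    print('Logged in as', client.get_me().username)
+```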
+
+
+
+
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 7ab5a9c..367b483 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -14,6 +14,9 @@ from selenium.common.exceptions import TimeoutException
from storages import Storage
from utils import mkdir_if_not_exists
+from selenium.webdriver.common.by import By
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
@dataclass
class ArchiveResult:
@@ -39,7 +42,7 @@ class Archiver(ABC):
return self.__class__.__name__
@abstractmethod
- def download(self, url, check_if_exists=False): pass
+ def download(self, url, check_if_exists=False, filenumber=None): pass
def get_netloc(self, url):
return urlparse(url).netloc
@@ -47,7 +50,8 @@ class Archiver(ABC):
def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
- def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
+ # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
+ def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
page = f'''
{url}
Archived media from {self.name}
@@ -61,18 +65,24 @@ class Archiver(ABC):
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
- page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
+        if filenumber is not None:
+            logger.trace(f'filenumber for directory is {filenumber}')
+            page_key = filenumber + "/" + page_key
+
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
+
+ page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
- def generate_media_page(self, urls, url, object):
+    # e.g. images in a tweet, saved to cloud storage
+ def generate_media_page(self, urls, url, object, filenumber=None):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
@@ -87,19 +97,30 @@ class Archiver(ABC):
filename = 'tmp/' + key
+ # eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
+ if filenumber is not None:
+ logger.debug(f'filenumber for directory is {filenumber}')
+ key = filenumber + "/" + key
+
+ # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
+ # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
+ # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key)
+
hash = self.get_hash(filename)
+
+ # eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
thumbnail = cdn_url
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
- return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
+ return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
def get_key(self, filename):
"""
@@ -119,15 +140,33 @@ class Archiver(ABC):
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
+
hash = hashlib.sha256(bytes)
+ # option to use SHA3_512 instead
+ # hash = hashlib.sha3_512(bytes)
f.close()
return hash.hexdigest()
- def get_screenshot(self, url):
+ # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
+ def get_screenshot(self, url, filenumber):
key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key
+ # Accept cookies popup dismiss for ytdlp video
+ if 'facebook.com' in url:
+ try:
+ logger.debug(f'Trying fb click accept cookie popup for {url}')
+ self.driver.get("http://www.facebook.com")
+                accept_button = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
+                accept_button.click()
+                logger.debug('fb click worked')
+                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+ time.sleep(2)
+ except:
+ logger.warning(f'Failed on fb accept cookies for url {url}')
+
try:
self.driver.get(url)
time.sleep(6)
@@ -135,8 +174,14 @@ class Archiver(ABC):
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename)
+
+ if filenumber is not None:
+ logger.debug(f'filenumber for directory is {filenumber}')
+ key = filenumber + "/" + key
+
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
+
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
@@ -167,10 +212,9 @@ class Archiver(ABC):
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
- cdn_url = self.storage.get_cdn_url(key)
-
self.storage.upload(thumbnail_filename, key)
+ cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index 5a7f63c..b19ab8f 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver):
name = "telegram"
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle
if 't.me' != self.get_netloc(url):
return False
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
if url[-8:] != "?embed=1":
url += "?embed=1"
- screenshot = self.get_screenshot(url)
+ screenshot = self.get_screenshot(url, filenumber=filenumber)
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
images += urls
- page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
+ page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber)
time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None
@@ -52,6 +52,9 @@ class TelegramArchiver(Archiver):
video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id)
+ if filenumber is not None:
+ key = filenumber + "/" + key
+
filename = 'tmp/' + key
cdn_url = self.storage.get_cdn_url(key)
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index 88bec58..5cee791 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -41,20 +41,22 @@ class TelethonArchiver(Archiver):
media.append(post)
return media
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
if not len(matches):
return False
status = "success"
- screenshot = self.get_screenshot(url)
+ screenshot = self.get_screenshot(url, filenumber)
+ # app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start():
matches = list(matches[0])
chat, post_id = matches[1], matches[2]
post_id = int(post_id)
+
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
@@ -65,9 +67,13 @@ class TelethonArchiver(Archiver):
if len(media_posts) > 1:
key = self.get_html_key(url)
- cdn_url = self.storage.get_cdn_url(key)
+
+ if filenumber is not None:
+ key = filenumber + "/" + key
if check_if_exists and self.storage.exists(key):
+ # only s3 storage supports storage.exists as not implemented on gd
+ cdn_url = self.storage.get_cdn_url(key)
status = 'already archived'
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
@@ -78,19 +84,26 @@ class TelethonArchiver(Archiver):
if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1]
+
+ if filenumber is not None:
+ key = filenumber + "/" + key
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
os.remove(filename)
- page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
+ page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'tmp/{key}')
key = filename.split('tmp/')[1].replace(" ", "")
+
+ if filenumber is not None:
+ key = filenumber + "/" + key
+
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
@@ -99,5 +112,5 @@ class TelethonArchiver(Archiver):
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
- page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
+ page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 6b5116f..9b90efa 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver):
name = "tiktok"
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
if 'tiktok.com' not in url:
return False
@@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
- except tiktok_downloader.Except.InvalidUrl:
+ except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
+ logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
+            logger.warning('Other Tiktok error: ' + str(error))
return ArchiveResult(status=status)
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 099d279..05e7ec0 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -1,6 +1,5 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
-import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
@@ -9,7 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
class TwitterArchiver(Archiver):
name = "twitter"
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
+
if 'twitter.com' != self.get_netloc(url):
return False
@@ -24,11 +24,14 @@ class TwitterArchiver(Archiver):
try:
tweet = next(scr.get_items())
- except:
- logger.warning('wah wah')
+ except Exception as ex:
+            template = "TwitterArchiver cannot get tweet, which can happen if it is a media-sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
+ message = template.format(type(ex).__name__, ex.args)
+ logger.warning(message)
return False
if tweet.media is None:
+ logger.trace(f'No media found')
return False
urls = []
@@ -45,8 +48,8 @@ class TwitterArchiver(Archiver):
else:
logger.warning(f"Could not get media URL of {media}")
- page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
+ page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
- screenshot = self.get_screenshot(url)
+ screenshot = self.get_screenshot(url, filenumber)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index 1fa98aa..652798a 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
+from loguru import logger
+
class WaybackArchiver(Archiver):
name = "wayback"
@@ -12,7 +14,7 @@ class WaybackArchiver(Archiver):
super(WaybackArchiver, self).__init__(storage, driver)
self.seen_urls = {}
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
@@ -25,9 +27,11 @@ class WaybackArchiver(Archiver):
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
+ logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
+ logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
@@ -71,7 +75,7 @@ class WaybackArchiver(Archiver):
except:
title = "Could not get title"
- screenshot = self.get_screenshot(url)
+ screenshot = self.get_screenshot(url, filenumber)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result
return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index ad8756b..9983950 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -15,7 +15,7 @@ class YoutubeDLArchiver(Archiver):
super().__init__(storage, driver)
self.fb_cookie = fb_cookie
- def download(self, url, check_if_exists=False):
+ def download(self, url, check_if_exists=False, filenumber=None):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'www.facebook.com']:
logger.debug('Using Facebook cookie')
@@ -27,13 +27,17 @@ class YoutubeDLArchiver(Archiver):
try:
info = ydl.extract_info(url, download=False)
- except yt_dlp.utils.DownloadError:
- # no video here
+ except yt_dlp.utils.DownloadError as e:
+ logger.debug(f'No video - Youtube normal control flow: {e}')
+ return False
+ except Exception as e:
+            logger.debug(f'ytdlp exception, which is normal; for example a facebook page with only images will cause an IndexError: list index out of range. Exception here is: \n {e}')
return False
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
+
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
logger.info('Found https://twitter.com/ in the download url from Twitter')
@@ -41,7 +45,6 @@ class YoutubeDLArchiver(Archiver):
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
return False
-
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
@@ -58,6 +61,9 @@ class YoutubeDLArchiver(Archiver):
key = self.get_key(filename)
+ if filenumber is not None:
+ key = filenumber + "/" + key
+
if self.storage.exists(key):
status = 'already archived'
cdn_url = self.storage.get_cdn_url(key)
@@ -81,12 +87,19 @@ class YoutubeDLArchiver(Archiver):
if status != 'already archived':
key = self.get_key(filename)
- cdn_url = self.storage.get_cdn_url(key)
+
+ if filenumber is not None:
+ key = filenumber + "/" + key
self.storage.upload(filename, key)
+ # filename ='tmp/sDE-qZdi8p8.webm'
+ # key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
+ cdn_url = self.storage.get_cdn_url(key)
+
hash = self.get_hash(filename)
- screenshot = self.get_screenshot(url)
+ screenshot = self.get_screenshot(url, filenumber)
+
# get duration
duration = info.get('duration')
diff --git a/auto_archive.py b/auto_archive.py
index c0ae085..a3c17d1 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -68,7 +68,7 @@ def expand_url(url):
return url
-def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
+def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
@@ -78,6 +78,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
+ gd_config = GDConfig(
+ root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
+ )
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
@@ -91,12 +94,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
gw = GWorksheet(wks, header_row=header, columns=columns)
if not gw.col_exists('url'):
- logger.warning(
+ logger.info(
f'No "{columns["url"]}" column found, skipping worksheet {wks.title}')
continue
if not gw.col_exists('status'):
- logger.warning(
+ logger.info(
f'No "{columns["status"]}" column found, skipping worksheet {wks.title}')
continue
@@ -104,26 +107,30 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config)
- # order matters, first to succeed excludes remaining
- active_archivers = [
- archivers.TelethonArchiver(s3_client, driver, telegram_config),
- archivers.TelegramArchiver(s3_client, driver),
- archivers.TiktokArchiver(s3_client, driver),
- archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
- archivers.TwitterArchiver(s3_client, driver),
- archivers.WaybackArchiver(s3_client, driver)
- ]
+ gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
+ gd_client = GDStorage(gd_config)
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
+
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
+ if usefilenumber:
+ filenumber = gw.get_cell(row, 'filenumber')
+ logger.debug(f'filenumber is {filenumber}')
+ if filenumber == "":
+                        logger.warning(f'Logic error on row {row} with url {url} - usefilenumber is True, yet no corresponding filenumber was found')
+ gw.set_cell(row, 'status', 'Missing filenumber')
+ continue
+ else:
+                    # filenumber=None is used throughout the app to signal that we are not saving into per-file folders
+ filenumber = None
# make a new driver so each spreadsheet row is idempotent
options = webdriver.FirefoxOptions()
@@ -134,24 +141,58 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
driver.set_window_size(1400, 2000)
# in seconds, telegram screenshots catch which don't come back
driver.set_page_load_timeout(120)
+
+ # client
+ storage_client = None
+ if storage == "s3":
+ storage_client = s3_client
+ elif storage == "gd":
+ storage_client = gd_client
+ else:
+                    raise ValueError(f'Cannot get a storage client for storage type {storage}')
+
+ # order matters, first to succeed excludes remaining
+ active_archivers = [
+ archivers.TelethonArchiver(storage_client, driver, telegram_config),
+ archivers.TelegramArchiver(storage_client, driver),
+ archivers.TiktokArchiver(storage_client, driver),
+ archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
+ archivers.TwitterArchiver(storage_client, driver),
+ archivers.WaybackArchiver(storage_client, driver)
+ ]
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
try:
- result = archiver.download(url, check_if_exists=True)
+ if usefilenumber:
+ # using filenumber to store in folders so not checking for existence of that url
+ result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
+ else:
+ result = archiver.download(url, check_if_exists=True)
+
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result:
- if result.status in ['success', 'already archived']:
+ # IA is a Success I believe - or do we want to display a logger warning for it?
+ if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
- f'{archiver} succeeded on row {row}')
+ f'{archiver} succeeded on row {row}, url {url}')
break
+
+ # wayback has seen this url before so keep existing status
+ if "wayback: Internet Archive fallback" in result.status:
+ logger.success(
+ f'wayback has seen this url before so keep existing status on row {row}')
+ result.status = result.status.replace(' (duplicate)', '')
+ result.status = str(result.status) + " (duplicate)"
+ break
+
logger.warning(
- f'{archiver} did not succeed on row {row}, final status: {result.status}')
+ f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
@@ -165,22 +206,34 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
@logger.catch
def main():
logger.debug(f'Passed args:{sys.argv}')
+
parser = argparse.ArgumentParser(
description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
parser.add_argument('--private', action='store_true', help='Store content without public access permission')
+ parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
+        help='Will save files into a subfolder on cloud storage named after the File Number, e.g. SM3012')
+ parser.add_argument('--storage', action='store', dest='storage', default='s3', \
+        help='s3 or gd storage. Default is s3. NOTE: gd storage only works with --use-filenumber-as-directory')
+
for k, v in GWorksheet.COLUMN_NAMES.items():
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
args = parser.parse_args()
config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
- logger.info(f'Opening document {args.sheet} for header {args.header}')
+ logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')
+
+ # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
+    # args.usefilenumber is True (of type bool) when the flag is set, or None when the argument is absent
+ usefilenumber = False
+ if args.usefilenumber:
+ usefilenumber = True
mkdir_if_not_exists('tmp')
- process_sheet(args.sheet, header=args.header, columns=config_columns)
+ process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns)
shutil.rmtree('tmp')
diff --git a/storages/base_storage.py b/storages/base_storage.py
index e1bf9c7..79a555b 100644
--- a/storages/base_storage.py
+++ b/storages/base_storage.py
@@ -17,5 +17,12 @@ class Storage(ABC):
def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
- with open(filename, 'rb') as f:
- self.uploadf(f, key, **kwargs)
+ # S3 requires an open file, GD only the filename
+ storage = type(self).__name__
+ if storage == "GDStorage":
+ self.uploadf(filename, key, **kwargs)
+ elif storage == "S3Storage":
+ with open(filename, 'rb') as f:
+ self.uploadf(f, key, **kwargs)
+ else:
+            raise ValueError(f'Unknown storage class {storage} in base_storage.py upload')
\ No newline at end of file
diff --git a/storages/gd_storage.py b/storages/gd_storage.py
index e69de29..4dab7d0 100644
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@@ -0,0 +1,202 @@
+from loguru import logger
+from .base_storage import Storage
+from dataclasses import dataclass
+
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload
+from google.oauth2 import service_account
+
+import time
+
+@dataclass
+class GDConfig:
+ root_folder_id: str
+
+class GDStorage(Storage):
+
+ def __init__(self, config: GDConfig):
+ self.root_folder_id = config.root_folder_id
+ SCOPES = ['https://www.googleapis.com/auth/drive']
+ creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
+ self.service = build('drive', 'v3', credentials=creds)
+
+ def _get_path(self, key):
+ return self.folder + key
+
+ def get_cdn_url(self, key):
+        # GD only supports files saved inside a folder
+        # S3 supports folders as well as files stored in the root
+
+ # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
+ foldername = key.split('/', 1)[0]
+ # eg twitter__media_asdf.jpg
+ filename = key.split('/', 1)[1]
+
+ logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
+
+ # retry policy on Google Drive
+ try_again = True
+ counter = 1
+ folder_id = None
+ while try_again:
+ # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
+ results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
+ and name = '{foldername}' ",
+ spaces='drive', # ie not appDataFolder or photos
+ fields='files(id, name)'
+ ).execute()
+ items = results.get('files', [])
+
+ for item in items:
+ logger.debug(f"found folder of {item['name']}")
+ folder_id= item['id']
+ try_again = False
+
+ if folder_id is None:
+                logger.debug(f'Cannot find {foldername=}, waiting and trying again {counter=}')
+ counter += 1
+ time.sleep(10)
+ if counter > 18:
+                    raise ValueError(f'Cannot find {foldername} after 18 retries of 10 seconds each (3 minutes)')
+
+        # check for a sub folder in the key, e.g. youtube_dl_sDE-qZdi8p8/index.html
+        # happens when doing thumbnails
+ a, _, b = filename.partition('/')
+
+ if b != '':
+ # a: 'youtube_dl_sDE-qZdi8p8'
+ # b: 'index.html'
+ logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
+
+ # get id of the sub folder
+ results = self.service.files().list(q=f"'{folder_id}' in parents \
+ and mimeType='application/vnd.google-apps.folder' \
+ and name = '{a}' ",
+ spaces='drive', # ie not appDataFolder or photos
+ fields='files(id, name)'
+ ).execute()
+ items = results.get('files', [])
+
+ filename = None
+ for item in items:
+ folder_id = item['id']
+ filename = b
+ if filename is None:
+ raise ValueError(f'Problem finding sub folder {a}')
+
+ # get id of file inside folder (or sub folder)
+ results = self.service.files().list(q=f"'{folder_id}' in parents \
+ and name = '{filename}' ",
+ spaces='drive',
+ fields='files(id, name)'
+ ).execute()
+ items = results.get('files', [])
+
+ file_id = None
+ for item in items:
+ logger.debug(f"found file of {item['name']}")
+ file_id= item['id']
+
+ if file_id is None:
+ raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
+
+        return 'https://drive.google.com/file/d/' + file_id + '/view?usp=sharing'
+
+ def exists(self, key):
+        # Not implemented yet
+        # Google Drive accepts duplicate filenames, as each is stored under a different file id
+
+ # try:
+ # self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+ # return True
+ # except ClientError:
+ # return False
+ return False
+
+ def uploadf(self, file, key, **kwargs):
+        # split on the first occurrence of /
+ # eg SM0005
+ foldername = key.split('/', 1)[0]
+ # eg twitter__media_asdf.jpg
+ filename = key.split('/', 1)[1]
+
+        # does the folder, e.g. SM0005, already exist inside the root auto-archiver folder?
+ results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
+ and mimeType='application/vnd.google-apps.folder' \
+ and name = '{foldername}' ",
+ spaces='drive',
+ fields='files(id, name)'
+ ).execute()
+ items = results.get('files', [])
+ folder_id_to_upload_to = None
+ if len(items) > 1:
+ logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
+
+ for item in items:
+ logger.debug(f"Found existing folder of {item['name']}")
+ folder_id_to_upload_to = item['id']
+
+ if folder_id_to_upload_to is None:
+ logger.debug(f'Creating new folder {foldername}')
+ file_metadata = {
+                'name': foldername,
+ 'mimeType': 'application/vnd.google-apps.folder',
+ 'parents': [self.root_folder_id]
+ }
+ gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
+ folder_id_to_upload_to = gd_file.get('id')
+
+        # check for a subfolder name in the key, e.g. youtube_dl_sDE-qZdi8p8/out1.jpg
+ # happens doing thumbnails
+
+ # will always return a and a blank b even if there is nothing to split
+ # https://stackoverflow.com/a/38149500/26086
+ a, _, b = filename.partition('/')
+
+ if b != '':
+ # a: 'youtube_dl_sDE-qZdi8p8'
+ # b: 'out1.jpg'
+ logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
+
+ # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
+ results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
+ and mimeType='application/vnd.google-apps.folder' \
+ and name = '{a}' ",
+ spaces='drive', # ie not appDataFolder or photos
+ fields='files(id, name)'
+ ).execute()
+ items = results.get('files', [])
+ sub_folder_id_to_upload_to = None
+ if len(items) > 1:
+ logger.error(f'Duplicate folder name of {a} which should never happen')
+
+ for item in items:
+ logger.debug(f"Found existing folder of {item['name']}")
+ sub_folder_id_to_upload_to = item['id']
+
+ if sub_folder_id_to_upload_to is None:
+ # create new folder
+ file_metadata = {
+                    'name': a,
+ 'mimeType': 'application/vnd.google-apps.folder',
+ 'parents': [folder_id_to_upload_to]
+ }
+ gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
+ sub_folder_id_to_upload_to = gd_file.get('id')
+
+ filename = b
+ folder_id_to_upload_to = sub_folder_id_to_upload_to
+ # back to normal control flow
+
+        # upload the file to Google Drive
+ file_metadata = {
+ # 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
+            'name': filename,
+ 'parents': [folder_id_to_upload_to]
+ }
+ media = MediaFileUpload(file, resumable=True)
+ gd_file = self.service.files().create(body=file_metadata,
+ media_body=media,
+ fields='id').execute()
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 6dec9b2..42afe04 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -9,6 +9,7 @@ class GWorksheet:
eg: if header=4, row 5 will be the first with data.
"""
COLUMN_NAMES = {
+ 'filenumber': 'file number',
'url': 'link',
'archive': 'archive location',
'date': 'archive date',