Save to folders for S3 and GD. Google Drive (GD) storage

pull/35/head
Dave Mateer 2022-05-11 15:39:44 +01:00
parent b3599dee71
commit dbac5accbd
15 changed files with 469 additions and 56 deletions

View file

@@ -7,4 +7,12 @@ INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=cookie: datr= xxxx
# Google Drive: right click on the folder, Get link, eg
# https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
# we want: 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
GD_ROOT_FOLDER_ID=
# Remember to share the folder with the service account eg
# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com

3 .gitignore vendored
View file

@@ -8,4 +8,5 @@ __pycache__/
anu.html
*.log
.pytest_cach
anon*

27 .vscode/launch.json vendored 100644
View file

@@ -0,0 +1,27 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Test Hashing",
"type": "python",
"request": "launch",
"program": "auto_archive.py",
"console": "integratedTerminal",
"justMyCode": true,
// "args": ["--sheet","Test Hashing"]
// "args": ["--sheet","Test Hashing","--use-filenumber-as-directory"]
"args": ["--sheet","Test Hashing","--use-filenumber-as-directory", "--storage=gd"]
},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

View file

@@ -1,6 +1,6 @@
# auto-archiver
This Python script will look for links to Youtube, Twitter, etc., in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
This Python script will look for links to Youtube, Twitter, etc., in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
## Setup
@@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
DO_SPACES_REGION=
@@ -23,8 +23,14 @@ DO_SPACES_KEY=
DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=
GD_ROOT_FOLDER_ID=
```
`.example.env` is an example of this file.
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
## Running
@@ -93,3 +99,29 @@ graph TD
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
```
## Saving into Folders
To use a column from the spreadsheet called `File Number` (eg SM001234) as a directory on the cloud storage, pass in:
```bash
python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
```
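Under the hood each archiver simply prefixes its storage key with the file number, which becomes a folder on S3 or Google Drive. A minimal sketch of the pattern used throughout this commit:
```python
# sketch of the key-prefixing pattern used by the archivers in this commit
key = 'twitter__media_FM7-ggCUYAQHKWW.jpg'
filenumber = 'SM001234'  # value read from the File Number column, or None

if filenumber is not None:
    key = filenumber + "/" + key  # -> 'SM001234/twitter__media_FM7-ggCUYAQHKWW.jpg'
```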
## Google Drive
To use Google Drive storage you need the ID of the shared folder in the `.env` file; the folder must be shared with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
```bash
python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
```
Note that you must use the file number (`--use-filenumber-as-directory`) with Google Drive storage.
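The folder ID is the last path segment of the share link. A hypothetical helper (not part of this commit) to extract it:
```python
from urllib.parse import urlparse

def extract_gd_folder_id(share_link: str) -> str:
    # eg https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
    # the folder ID is the final path segment: 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
    return urlparse(share_link).path.rstrip('/').split('/')[-1]
```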
## Telethon (Telegram's API Library)
Put your `anon.session` in the root so that it doesn't stall and ask for authentication.
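A minimal sketch for creating `anon.session` interactively before deploying, assuming `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` are already set in your environment:
```python
import os
from telethon.sync import TelegramClient

# the first run prompts for a phone number and auth code, then writes anon.session
with TelegramClient('anon', int(os.getenv('TELEGRAM_API_ID')), os.getenv('TELEGRAM_API_HASH')) as client:
    print(f'Session created for {client.get_me().username}')
```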

View file

@@ -14,6 +14,9 @@ from selenium.common.exceptions import TimeoutException
from storages import Storage
from utils import mkdir_if_not_exists
from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException
@dataclass
class ArchiveResult:
@@ -39,7 +42,7 @@ class Archiver(ABC):
return self.__class__.__name__
@abstractmethod
def download(self, url, check_if_exists=False): pass
def download(self, url, check_if_exists=False, filenumber=None): pass
def get_netloc(self, url):
return urlparse(url).netloc
@@ -47,7 +50,8 @@
def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
@@ -61,18 +65,24 @@
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
if filenumber is not None:
logger.trace(f'filenumber for directory is {filenumber}')
page_key = filenumber + "/" + page_key
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
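# recompute the CDN URL now that page_key may include the filenumber folder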
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
def generate_media_page(self, urls, url, object):
# eg images in a tweet save to cloud storage
def generate_media_page(self, urls, url, object, filenumber=None):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
@@ -87,19 +97,30 @@
filename = 'tmp/' + key
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
if filenumber is not None:
logger.debug(f'filenumber for directory is {filenumber}')
key = filenumber + "/" + key
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key)
hash = self.get_hash(filename)
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
thumbnail = cdn_url
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
def get_key(self, filename):
"""
@@ -119,15 +140,33 @@ class Archiver(ABC):
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
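# (for very large files, hashing in chunks with hash.update() would avoid reading the whole file into memory)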
f.close()
return hash.hexdigest()
def get_screenshot(self, url):
# eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
# def get_screenshot(self, url, filenumber, storage="GD"):
def get_screenshot(self, url, filenumber):
key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key
# Accept cookies popup dismiss for ytdlp video
if 'facebook.com' in url:
try:
logger.debug(f'Trying fb click accept cookie popup for {url}')
self.driver.get("http://www.facebook.com")
accept_button = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
accept_button.click()
logger.debug('fb click worked')
# linux server needs a sleep otherwise the facebook cookie won't have worked and we'll get a popup on the next page
time.sleep(2)
except Exception:
logger.warning(f'Failed on fb accept cookies for url {url}')
try:
self.driver.get(url)
time.sleep(6)
@@ -135,8 +174,14 @@
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename)
if filenumber is not None:
logger.debug(f'filenumber for directory is {filenumber}')
key = filenumber + "/" + key
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
@@ -167,10 +212,9 @@ class Archiver(ABC):
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:

View file

@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver):
name = "telegram"
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle
if 't.me' != self.get_netloc(url):
return False
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
if url[-8:] != "?embed=1":
url += "?embed=1"
screenshot = self.get_screenshot(url)
screenshot = self.get_screenshot(url, filenumber=filenumber)
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
images += urls
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)), filenumber=filenumber)
time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None
@@ -52,6 +52,9 @@
video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id)
if filenumber is not None:
key = filenumber + "/" + key
filename = 'tmp/' + key
cdn_url = self.storage.get_cdn_url(key)

View file

@@ -41,20 +41,22 @@ class TelethonArchiver(Archiver):
media.append(post)
return media
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
if not len(matches):
return False
status = "success"
screenshot = self.get_screenshot(url)
screenshot = self.get_screenshot(url, filenumber)
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start():
matches = list(matches[0])
chat, post_id = matches[1], matches[2]
post_id = int(post_id)
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
@@ -65,9 +67,13 @@ class TelethonArchiver(Archiver):
if len(media_posts) > 1:
key = self.get_html_key(url)
cdn_url = self.storage.get_cdn_url(key)
if filenumber is not None:
key = filenumber + "/" + key
if check_if_exists and self.storage.exists(key):
# only S3 storage supports storage.exists; it is not implemented on GD
cdn_url = self.storage.get_cdn_url(key)
status = 'already archived'
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
@@ -78,19 +84,26 @@
if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1]
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
os.remove(filename)
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'tmp/{key}')
key = filename.split('tmp/')[1].replace(" ", "")
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
@@ -99,5 +112,5 @@ class TelethonArchiver(Archiver):
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)

View file

@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver):
name = "tiktok"
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
if 'tiktok.com' not in url:
return False
@@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl:
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error: {error}')
return ArchiveResult(status=status)

View file

@@ -1,6 +1,5 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
@@ -9,7 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
class TwitterArchiver(Archiver):
name = "twitter"
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
if 'twitter.com' != self.get_netloc(url):
return False
@@ -24,11 +24,14 @@ class TwitterArchiver(Archiver):
try:
tweet = next(scr.get_items())
except:
logger.warning('wah wah')
except Exception as ex:
template = "TwitterArchiver can't get the tweet and threw, which can happen with a media-sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
message = template.format(type(ex).__name__, ex.args)
logger.warning(message)
return False
if tweet.media is None:
logger.trace('No media found')
return False
urls = []
@@ -45,8 +48,8 @@ class TwitterArchiver(Archiver):
else:
logger.warning(f"Could not get media URL of {media}")
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
screenshot = self.get_screenshot(url)
screenshot = self.get_screenshot(url, filenumber)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)

View file

@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from loguru import logger
class WaybackArchiver(Archiver):
name = "wayback"
@@ -12,7 +14,7 @@ class WaybackArchiver(Archiver):
super(WaybackArchiver, self).__init__(storage, driver)
self.seen_urls = {}
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
@@ -25,9 +27,11 @@
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
@@ -71,7 +75,7 @@
except:
title = "Could not get title"
screenshot = self.get_screenshot(url)
screenshot = self.get_screenshot(url, filenumber)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result
return result

View file

@@ -15,7 +15,7 @@ class YoutubeDLArchiver(Archiver):
super().__init__(storage, driver)
self.fb_cookie = fb_cookie
def download(self, url, check_if_exists=False):
def download(self, url, check_if_exists=False, filenumber=None):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'www.facebook.com']:
logger.debug('Using Facebook cookie')
@@ -27,13 +27,17 @@
try:
info = ydl.extract_info(url, download=False)
except yt_dlp.utils.DownloadError:
# no video here
except yt_dlp.utils.DownloadError as e:
logger.debug(f'No video - Youtube normal control flow: {e}')
return False
except Exception as e:
logger.debug(f'ytdlp exception, which is normal - for example a facebook page with images only will cause an IndexError: list index out of range. Exception here is: \n {e}')
return False
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
logger.info('Found https://twitter.com/ in the download url from Twitter')
@@ -41,7 +45,6 @@
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
return False
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
@@ -58,6 +61,9 @@
key = self.get_key(filename)
if filenumber is not None:
key = filenumber + "/" + key
if self.storage.exists(key):
status = 'already archived'
cdn_url = self.storage.get_cdn_url(key)
@@ -81,12 +87,19 @@
if status != 'already archived':
key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key)
# filename ='tmp/sDE-qZdi8p8.webm'
# key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
cdn_url = self.storage.get_cdn_url(key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
screenshot = self.get_screenshot(url, filenumber)
# get duration
duration = info.get('duration')

View file

@@ -68,7 +68,7 @@ def expand_url(url):
return url
def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
@@ -78,6 +78,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
gd_config = GDConfig(
root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
@@ -91,12 +94,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
gw = GWorksheet(wks, header_row=header, columns=columns)
if not gw.col_exists('url'):
logger.warning(
logger.info(
f'No "{columns["url"]}" column found, skipping worksheet {wks.title}')
continue
if not gw.col_exists('status'):
logger.warning(
logger.info(
f'No "{columns["status"]}" column found, skipping worksheet {wks.title}')
continue
@@ -104,26 +107,30 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelethonArchiver(s3_client, driver, telegram_config),
archivers.TelegramArchiver(s3_client, driver),
archivers.TiktokArchiver(s3_client, driver),
archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
archivers.TwitterArchiver(s3_client, driver),
archivers.WaybackArchiver(s3_client, driver)
]
gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
gd_client = GDStorage(gd_config)
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
if usefilenumber:
filenumber = gw.get_cell(row, 'filenumber')
logger.debug(f'filenumber is {filenumber}')
if filenumber == "":
logger.warning(f"Logic error on row {row} with url {url} - the feature flag for usefilenumber is True, yet can't find a corresponding filenumber")
gw.set_cell(row, 'status', 'Missing filenumber')
continue
else:
# We will use this through the app to differentiate between where to save
filenumber = None
# make a new driver so each spreadsheet row is idempotent
options = webdriver.FirefoxOptions()
@@ -134,24 +141,58 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
driver.set_window_size(1400, 2000)
# in seconds; catches telegram screenshots which don't come back
driver.set_page_load_timeout(120)
# client
storage_client = None
if storage == "s3":
storage_client = s3_client
elif storage == "gd":
storage_client = gd_client
else:
raise ValueError(f"Can't get storage_client for storage type {storage}")
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelethonArchiver(storage_client, driver, telegram_config),
archivers.TelegramArchiver(storage_client, driver),
archivers.TiktokArchiver(storage_client, driver),
archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
archivers.TwitterArchiver(storage_client, driver),
archivers.WaybackArchiver(storage_client, driver)
]
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
try:
result = archiver.download(url, check_if_exists=True)
if usefilenumber:
# using filenumber to store in folders so not checking for existence of that url
result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
else:
result = archiver.download(url, check_if_exists=True)
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result:
if result.status in ['success', 'already archived']:
# IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}')
f'{archiver} succeeded on row {row}, url {url}')
break
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning(
f'{archiver} did not succeed on row {row}, final status: {result.status}')
f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
@@ -165,22 +206,34 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
@logger.catch
def main():
logger.debug(f'Passed args:{sys.argv}')
parser = argparse.ArgumentParser(
description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
parser.add_argument('--private', action='store_true', help='Store content without public access permission')
parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012')
parser.add_argument('--storage', action='store', dest='storage', default='s3', \
help='s3 or gd storage. Default is s3. NOTE GD storage supports only using filenumber')
for k, v in GWorksheet.COLUMN_NAMES.items():
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
args = parser.parse_args()
config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
logger.info(f'Opening document {args.sheet} for header {args.header}')
logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')
# https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
# args.usefilenumber is True (of type bool) when set, or None when the argument is not there
usefilenumber = False
if args.usefilenumber:
usefilenumber = True
mkdir_if_not_exists('tmp')
process_sheet(args.sheet, header=args.header, columns=config_columns)
process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns)
shutil.rmtree('tmp')

View file

@@ -17,5 +17,12 @@ class Storage(ABC):
def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
# S3 requires an open file, GD only the filename
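# (a sketch note: an isinstance check or a per-subclass uploadf override would avoid comparing class names as strings)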
storage = type(self).__name__
if storage == "GDStorage":
self.uploadf(filename, key, **kwargs)
elif storage == "S3Storage":
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
else:
raise ValueError(f"Can't determine storage type {storage} in base_storage.py")

View file

@@ -0,0 +1,202 @@
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
import time
@dataclass
class GDConfig:
root_folder_id: str
class GDStorage(Storage):
def __init__(self, config: GDConfig):
self.root_folder_id = config.root_folder_id
SCOPES = ['https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
self.service = build('drive', 'v3', credentials=creds)
def _get_path(self, key):
return self.folder + key
def get_cdn_url(self, key):
# only supports files saved in a folder for GD
# S3 supports folder and all stored in the root
# key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
foldername = key.split('/', 1)[0]
# eg twitter__media_asdf.jpg
filename = key.split('/', 1)[1]
logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
# retry policy on Google Drive
try_again = True
counter = 1
folder_id = None
while try_again:
# need to look up the id of folder eg SM0002, which should be there already as this is get_cdn_url
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
and name = '{foldername}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
for item in items:
logger.debug(f"found folder of {item['name']}")
folder_id = item['id']
try_again = False
if folder_id is None:
logger.debug(f"Can't find {foldername=}, waiting and trying again {counter=}")
counter += 1
time.sleep(10)
if counter > 18:
raise ValueError(f"Can't find {foldername} after 18 retries, pausing 10 seconds each time (3 minutes in total)")
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html
# happens doing thumbnails
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'index.html'
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
# get id of the sub folder
results = self.service.files().list(q=f"'{folder_id}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
filename = None
for item in items:
folder_id = item['id']
filename = b
if filename is None:
raise ValueError(f'Problem finding sub folder {a}')
# get id of file inside folder (or sub folder)
results = self.service.files().list(q=f"'{folder_id}' in parents \
and name = '{filename}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
file_id = None
for item in items:
logger.debug(f"found file of {item['name']}")
file_id = item['id']
if file_id is None:
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
return "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
def exists(self, key):
# Not implemented yet
# Google Drive will accept duplicate filenames as each is stored under a different file id
# try:
# self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
# return True
# except ClientError:
# return False
return False
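# a possible GD implementation (a sketch, not in this commit) would mirror the
# files().list queries used elsewhere in this class, eg:
#   results = self.service.files().list(q=f"'{folder_id}' in parents and name = '{filename}'",
#                                       spaces='drive', fields='files(id)').execute()
#   return len(results.get('files', [])) > 0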
def uploadf(self, file, key, **kwargs):
# split on first occurrence of /
# eg SM0005
foldername = key.split('/', 1)[0]
# eg twitter__media_asdf.jpg
filename = key.split('/', 1)[1]
# does folder eg SM0005 exist already inside parent of Files auto-archiver
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{foldername}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
folder_id_to_upload_to = item['id']
if folder_id_to_upload_to is None:
logger.debug(f'Creating new folder {foldername}')
file_metadata = {
'name': foldername,
'mimeType': 'application/vnd.google-apps.folder',
'parents': [self.root_folder_id]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
folder_id_to_upload_to = gd_file.get('id')
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg
# happens doing thumbnails
# will always return a and a blank b even if there is nothing to split
# https://stackoverflow.com/a/38149500/26086
a, _, b = filename.partition('/')
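# example: partition always returns a 3-tuple, even when '/' is absent:
#   'youtube_dl_sDE-qZdi8p8/out1.jpg'.partition('/') -> ('youtube_dl_sDE-qZdi8p8', '/', 'out1.jpg')
#   'out1.jpg'.partition('/') -> ('out1.jpg', '', '')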
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'out1.jpg'
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
sub_folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {a} which should never happen')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
sub_folder_id_to_upload_to = item['id']
if sub_folder_id_to_upload_to is None:
# create new folder
file_metadata = {
'name': a,
'mimeType': 'application/vnd.google-apps.folder',
'parents': [folder_id_to_upload_to]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
sub_folder_id_to_upload_to = gd_file.get('id')
filename = b
folder_id_to_upload_to = sub_folder_id_to_upload_to
# back to normal control flow
# upload file to gd
file_metadata = {
# 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
'name': filename,
'parents': [folder_id_to_upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata,
media_body=media,
fields='id').execute()

View file

@@ -9,6 +9,7 @@ class GWorksheet:
eg: if header=4, row 5 will be the first with data.
"""
COLUMN_NAMES = {
'filenumber': 'file number',
'url': 'link',
'archive': 'archive location',
'date': 'archive date',