Merge branch 'dev' into refactor-configs

pull/33/head
msramalho 2022-06-02 17:30:47 +02:00
commit 10f03cb888
15 changed files with 512 additions and 43 deletions

View file

@ -7,4 +7,12 @@ INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=cookie: datr= xxxx
# Google Drive, Right click on folder, Get link:
# https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing
# we want: 123456789987654321abcdefghijk
# Remember to share the folder with the service email
# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
GD_ROOT_FOLDER_ID=
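
The folder ID is simply the last path segment of the share link; a minimal sketch (a hypothetical helper, not part of this repo) to extract it:

```python
from urllib.parse import urlparse

def folder_id_from_share_link(link: str) -> str:
    # https://drive.google.com/drive/folders/<ID>?usp=sharing -> <ID>
    return urlparse(link).path.rstrip("/").split("/")[-1]

print(folder_id_from_share_link(
    "https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing"))
# prints: 123456789987654321abcdefghijk
```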

View file

@ -17,8 +17,10 @@ selenium = "*"
snscrape = "*"
yt-dlp = "*"
telethon = "*"
[dev-packages]
google-api-python-client = "*"
google-auth-httplib2 = "*"
google-auth-oauthlib = "*"
oauth2client = "*"
[requires]
python_version = "3.9"

109
Pipfile.lock (generated)
View file

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e27ea0a6fdf6e588c14fbb90af45f784b9e55a9b986a3b50770490648ba96720"
"sha256": "25b858227d74cc232bba525d34dcf30f15d18d535a6e9c59555e85a0a2bd8c61"
},
"pipfile-spec": 6,
"requires": {
@ -141,18 +141,19 @@
},
"cachetools": {
"hashes": [
"sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6",
"sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4"
"sha256:4ebbd38701cdfd3603d1f751d851ed248ab4570929f2d8a7ce69e30c420b141c",
"sha256:8b3b8fa53f564762e5b221e9896798951e7f915513abf2ba072ce0f07f3f5a98"
],
"markers": "python_version ~= '3.7'",
"version": "==5.0.0"
"version": "==5.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"version": "==2021.10.8"
"markers": "python_version >= '3.6'",
"version": "==2022.5.18.1"
},
"cffi": {
"hashes": [
@ -277,11 +278,11 @@
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
"sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0"
"sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20",
"sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"
],
"markers": "python_version >= '3.7'",
"version": "==3.6.0"
"version": "==3.7.0"
},
"flask": {
"hashes": [
@ -298,6 +299,22 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"google-api-core": {
"hashes": [
"sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3",
"sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d"
],
"markers": "python_version >= '3.6'",
"version": "==2.8.0"
},
"google-api-python-client": {
"hashes": [
"sha256:4527f7b8518a795624ab68da412d55628f83b98c67dd6a5d6edf725454f8b30b",
"sha256:600c43d7eac6e3536fdcad1d14ba9ee503edf4c7db0bd827e791bbf03b9f1330"
],
"index": "pypi",
"version": "==2.48.0"
},
"google-auth": {
"hashes": [
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
@ -306,14 +323,30 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.6"
},
"google-auth-httplib2": {
"hashes": [
"sha256:31e49c36c6b5643b57e82617cb3e021e3e1d2df9da63af67252c02fa9c1f4a10",
"sha256:a07c39fd632becacd3f07718dfd6021bf396978f03ad3ce4321d060015cc30ac"
],
"index": "pypi",
"version": "==0.1.0"
},
"google-auth-oauthlib": {
"hashes": [
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
],
"markers": "python_version >= '3.6'",
"index": "pypi",
"version": "==0.5.1"
},
"googleapis-common-protos": {
"hashes": [
"sha256:6b5ee59dc646eb61a8eb65ee1db186d3df6687c8804830024f32573298bca19b",
"sha256:ddcd955b5bb6589368f659fa475373faa1ed7d09cde5ba25e88513d87007e174"
],
"markers": "python_version >= '3.6'",
"version": "==1.56.1"
},
"gspread": {
"hashes": [
"sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283",
@ -330,6 +363,14 @@
"markers": "python_version >= '3.6'",
"version": "==0.13.0"
},
"httplib2": {
"hashes": [
"sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585",
"sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.20.4"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
@ -499,6 +540,14 @@
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"oauth2client": {
"hashes": [
"sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac",
"sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6"
],
"index": "pypi",
"version": "==4.1.3"
},
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@ -515,6 +564,36 @@
"markers": "python_version >= '3.6'",
"version": "==1.1.0"
},
"protobuf": {
"hashes": [
"sha256:06059eb6953ff01e56a25cd02cca1a9649a75a7e65397b5b9b4e929ed71d10cf",
"sha256:097c5d8a9808302fb0da7e20edf0b8d4703274d140fd25c5edabddcde43e081f",
"sha256:284f86a6207c897542d7e956eb243a36bb8f9564c1742b253462386e96c6b78f",
"sha256:32ca378605b41fd180dfe4e14d3226386d8d1b002ab31c969c366549e66a2bb7",
"sha256:3cc797c9d15d7689ed507b165cd05913acb992d78b379f6014e013f9ecb20996",
"sha256:62f1b5c4cd6c5402b4e2d63804ba49a327e0c386c99b1675c8a0fefda23b2067",
"sha256:69ccfdf3657ba59569c64295b7d51325f91af586f8d5793b734260dfe2e94e2c",
"sha256:6f50601512a3d23625d8a85b1638d914a0970f17920ff39cec63aaef80a93fb7",
"sha256:7403941f6d0992d40161aa8bb23e12575637008a5a02283a930addc0508982f9",
"sha256:755f3aee41354ae395e104d62119cb223339a8f3276a0cd009ffabfcdd46bb0c",
"sha256:77053d28427a29987ca9caf7b72ccafee011257561259faba8dd308fda9a8739",
"sha256:7e371f10abe57cee5021797126c93479f59fccc9693dafd6bd5633ab67808a91",
"sha256:9016d01c91e8e625141d24ec1b20fed584703e527d28512aa8c8707f105a683c",
"sha256:9be73ad47579abc26c12024239d3540e6b765182a91dbc88e23658ab71767153",
"sha256:adc31566d027f45efe3f44eeb5b1f329da43891634d61c75a5944e9be6dd42c9",
"sha256:adfc6cf69c7f8c50fd24c793964eef18f0ac321315439d94945820612849c388",
"sha256:af0ebadc74e281a517141daad9d0f2c5d93ab78e9d455113719a45a49da9db4e",
"sha256:cb29edb9eab15742d791e1025dd7b6a8f6fcb53802ad2f6e3adcb102051063ab",
"sha256:cd68be2559e2a3b84f517fb029ee611546f7812b1fdd0aa2ecc9bc6ec0e4fdde",
"sha256:cdee09140e1cd184ba9324ec1df410e7147242b94b5f8b0c64fc89e38a8ba531",
"sha256:db977c4ca738dd9ce508557d4fce0f5aebd105e158c725beec86feb1f6bc20d8",
"sha256:dd5789b2948ca702c17027c84c2accb552fc30f4622a98ab5c51fcfe8c50d3e7",
"sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20",
"sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3"
],
"markers": "python_version >= '3.7'",
"version": "==3.20.1"
},
"pyaes": {
"hashes": [
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
@ -749,6 +828,14 @@
"markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"uritemplate": {
"hashes": [
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
"markers": "python_version >= '3.6'",
"version": "==4.1.1"
},
"urllib3": {
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",

View file

@ -1,6 +1,6 @@
# auto-archiver
This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
## Setup
@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
DO_SPACES_REGION=
@ -23,8 +23,14 @@ DO_SPACES_KEY=
DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=
GD_ROOT_FOLDER_ID=
```
`.example.env` is an example of this file.
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
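
The script reads these values with `os.getenv`, so they need to be exported or loaded from `.env`; a quick sanity check, assuming `python-dotenv` is available (an assumption, not something this README mandates):

```python
import os
from dotenv import load_dotenv  # assumption: python-dotenv is installed

load_dotenv(".env")
for var in ("DO_SPACES_REGION", "DO_SPACES_KEY", "DO_SPACES_SECRET",
            "TELEGRAM_API_ID", "GD_ROOT_FOLDER_ID"):
    print(var, "set" if os.getenv(var) else "MISSING")
```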
## Running
@ -93,4 +99,25 @@ graph TD
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
A(BaseStorage) -->|parent of| C(LocalStorage)
A(BaseStorage) -->|parent of| D(GoogleDriveStorage)
```
## Saving into Subfolders
You can add a column to the spreadsheet whose name is passed via the `--col-subfolder` argument; its value is handed to the storage and specifies a subfolder to put the archived media into (see the sketch below).
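
A small sketch of how a per-row subfolder value ends up in the stored key; it copies the `clean_path` helper added to `Storage` in this PR, and the example values are illustrative:

```python
from pathlib import Path

def clean_path(folder, default="", add_forward_slash=True):
    # copy of Storage.clean_path from this PR
    if folder is None or type(folder) != str or len(folder.strip()) == 0:
        return default
    return str(Path(folder)) + ("/" if add_forward_slash else "")

subfolder = "SM0005"                       # value read from the 'sub folder' column
key = "twitter__media_FM7-ggCUYAQHKWW.jpg"
print(clean_path(subfolder) + key)         # -> 'SM0005/twitter__media_FM7-ggCUYAQHKWW.jpg'
print(clean_path(None) + key)              # empty subfolder -> key stays at the root
```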
## Google Drive
To use Google Drive storage you need the ID of the shared folder in the `.env` file, and the folder must be shared with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
```bash
python auto_archive.py --sheet 'Sheet Name' --storage='gd'
```
## Telethon (Telegram's API Library)
Put your `anon.session` in the root so that the script doesn't stall and ask for authentication (see the sketch below for one way to generate it).
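
One way to create `anon.session` ahead of time is to run Telethon interactively once; a minimal sketch, assuming `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` are set as above:

```python
import os
from telethon.sync import TelegramClient

# Creates/refreshes anon.session in the working directory after an interactive login,
# so later runs of the archiver do not stall asking for a phone number and code.
with TelegramClient("anon", int(os.getenv("TELEGRAM_API_ID")), os.getenv("TELEGRAM_API_HASH")) as client:
    print("logged in as", client.get_me().username)
```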

View file

@ -14,6 +14,10 @@ from selenium.common.exceptions import TimeoutException
from storages import Storage
from utils import mkdir_if_not_exists
from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException
@dataclass
class ArchiveResult:
@ -47,6 +51,7 @@ class Archiver(ABC):
def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
@ -70,8 +75,11 @@ class Archiver(ABC):
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
# eg images in a tweet are saved to cloud storage
def generate_media_page(self, urls, url, object):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@ -87,12 +95,19 @@ class Archiver(ABC):
filename = Storage.TMP_FOLDER + key
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key)
hash = self.get_hash(filename)
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
@ -119,7 +134,11 @@ class Archiver(ABC):
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
# TODO: customizable hash
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
f.close()
return hash.hexdigest()
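
The `TODO` above suggests making the algorithm configurable; a sketch of one way to do that (a hypothetical helper, not part of this PR):

```python
import hashlib

def file_hash(filename: str, algorithm: str = "sha256") -> str:
    # hashlib.new accepts any name in hashlib.algorithms_available, eg "sha3_512"
    h = hashlib.new(algorithm)
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()
```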
@ -128,6 +147,19 @@ class Archiver(ABC):
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = Storage.TMP_FOLDER + key
# dismiss Facebook's accept-cookies popup so it doesn't appear in the screenshot
if 'facebook.com' in url:
try:
logger.debug(f'Trying fb click accept cookie popup for {url}')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug('fb click worked')
# linux server needs a sleep otherwise the facebook cookie won't have worked and we'll get a popup on the next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies for url {url}')
try:
self.driver.get(url)
time.sleep(6)
@ -137,6 +169,7 @@ class Archiver(ABC):
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
@ -167,10 +200,9 @@ class Archiver(ABC):
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:

View file

@ -6,6 +6,7 @@ from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from configs import TelegramConfig
@ -43,15 +44,21 @@ class TelethonArchiver(Archiver):
status = "success"
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start():
matches = list(matches[0])
chat, post_id = matches[1], matches[2]
post_id = int(post_id)
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
return False
except ChannelInvalidError as e:
# TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
logger.error(f'Could not fetch telegram {url}, possibly it\'s private or not displayable: {e}')
return False
media_posts = self._get_media_posts_in_group(chat, post)
@ -60,9 +67,10 @@ class TelethonArchiver(Archiver):
if len(media_posts) > 1:
key = self.get_html_key(url)
cdn_url = self.storage.get_cdn_url(key)
if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists; it is not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
status = 'already archived'
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)

View file

@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl:
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
return ArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error: {error}')
return ArchiveResult(status=status)

View file

@ -1,6 +1,5 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
@ -10,13 +9,14 @@ class TwitterArchiver(Archiver):
name = "twitter"
def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url):
return False
tweet_id = urlparse(url).path.split('/')
if 'status' in tweet_id:
i = tweet_id.index('status')
tweet_id = tweet_id[i+1]
tweet_id = tweet_id[i + 1]
else:
return False
@ -24,11 +24,12 @@ class TwitterArchiver(Archiver):
try:
tweet = next(scr.get_items())
except:
logger.warning('wah wah')
except Exception as ex:
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return False
if tweet.media is None:
logger.trace(f'No media found')
return False
urls = []

View file

@ -5,6 +5,9 @@ from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig
from loguru import logger
class WaybackArchiver(Archiver):
name = "wayback"
@ -26,9 +29,11 @@ class WaybackArchiver(Archiver):
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']

View file

@ -7,6 +7,7 @@ from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
@ -27,13 +28,17 @@ class YoutubeDLArchiver(Archiver):
try:
info = ydl.extract_info(url, download=False)
except yt_dlp.utils.DownloadError:
# no video here
except yt_dlp.utils.DownloadError as e:
logger.debug(f'No video - Youtube normal control flow: {e}')
return False
except Exception as e:
logger.debug(f'ytdlp exception, which can be normal - eg a facebook page with images only will raise IndexError: list index out of range. Exception here is: \n {e}')
return False
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media")
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
logger.info('Found https://twitter.com/ in the download url from Twitter')
@ -41,7 +46,6 @@ class YoutubeDLArchiver(Archiver):
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
return False
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
@ -81,10 +85,12 @@ class YoutubeDLArchiver(Archiver):
if status != 'already archived':
key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(filename, key)
# filename ='tmp/sDE-qZdi8p8.webm'
# key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
cdn_url = self.storage.get_cdn_url(key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
@ -102,9 +108,9 @@ class YoutubeDLArchiver(Archiver):
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
if 'timestamp' in info else \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
if 'upload_date' in info and info['upload_date'] is not None else \
None
None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)

View file

@ -9,6 +9,11 @@ import traceback
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
import archivers
from storages import S3Storage, S3Config
from storages.gd_storage import GDConfig, GDStorage
from utils import GWorksheet, mkdir_if_not_exists
import sys
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
@ -56,6 +61,25 @@ def update_sheet(gw, row, result: ArchiveResult):
def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
sh = c.gsheets_client.open(sheet)
def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
gd_config = GDConfig(
root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
@ -75,16 +99,22 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
storage = c.get_storage()
gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
gd_client = GDStorage(gd_config)
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
subfolder = gw.get_cell_or_default(row, 'subfolder')
# make a new driver so each spreadsheet row is idempotent
c.recreate_webdriver()
@ -98,16 +128,35 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
storage_client = None
if storage == "s3":
storage_client = s3_client
elif storage == "gd":
storage_client = gd_client
else:
raise ValueError(f"Can't get a storage_client for storage type {storage}")
storage_client.update_properties(subfolder=subfolder)
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
try:
result = archiver.download(url, check_if_exists=True)
except KeyboardInterrupt:
logger.warning("caught interrupt")
gw.set_cell(row, 'status', '')
driver.quit()
exit()
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result:
# IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}, url {url}')
if result.status in ['success', 'already archived']:
result.status = f"{archiver.name}: {result.status}"
logger.success(f'{archiver} succeeded on row {row}')
@ -115,6 +164,21 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = f"{archiver.name}: {result.status}"
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning(
f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
driver.quit()
if result:
update_sheet(gw, row, result)
else:
@ -129,12 +193,25 @@ def main():
c.parse()
logger.info(f'Opening document {c.sheet} for header {c.header}')
parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to fill with {k} (defaults={v})"
if k == "subfolder":
help = f"the name of the column to read the {k} from (defaults={v})"
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
mkdir_if_not_exists(c.tmp_folder)
process_sheet(c, c.sheet, header=c.header, columns=c.column_names)
shutil.rmtree(c.tmp_folder)
c.destroy_webdriver()
logger.info(f'Opening document {args.sheet} for header {args.header}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet, header=args.header, columns=config_columns)
shutil.rmtree('tmp')
if __name__ == '__main__':
main()

View file

@ -1,5 +1,6 @@
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path
class Storage(ABC):
@ -20,3 +21,25 @@ class Storage(ABC):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
def update_properties(self, **kwargs):
"""
method used to update general properties that some children may use
and others not, but that all can call
"""
for k, v in kwargs.items():
if k in self.get_allowed_properties():
setattr(self, k, v)
else:
logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
def get_allowed_properties(self):
"""
child classes should specify which properties they allow to be set
"""
return set(["subfolder"])
def clean_path(self, folder, default="", add_forward_slash=True):
if folder is None or type(folder) != str or len(folder.strip()) == 0:
return default
return str(Path(folder)) + ("/" if add_forward_slash else "")
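
A short usage sketch of the new `update_properties` hook (the `MiniStorage` stand-in is hypothetical and only mirrors the filtering logic above):

```python
class MiniStorage:
    def get_allowed_properties(self):
        return {"subfolder"}

    def update_properties(self, **kwargs):
        # same filtering as Storage.update_properties: only allowed names are set
        for k, v in kwargs.items():
            if k in self.get_allowed_properties():
                setattr(self, k, v)
            else:
                print(f"does not accept dynamic property {k!r}")

s = MiniStorage()
s.update_properties(subfolder="SM0005")   # accepted, later used when building keys
s.update_properties(bucket="nope")        # rejected: not in the allowed set
print(s.subfolder)                        # -> SM0005
```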

View file

@ -0,0 +1,187 @@
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
import time
@dataclass
class GDConfig:
root_folder_id: str
class GDStorage(Storage):
DEFAULT_UPLOAD_FOLDER_NAME = "default"
def __init__(self, config: GDConfig):
self.root_folder_id = config.root_folder_id
SCOPES = ['https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
self.service = build('drive', 'v3', credentials=creds)
def get_cdn_url(self, key):
"""
GD only supports files saved inside a folder;
S3 supports both folders and files stored in the root
"""
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
# retry policy on Google Drive
try_again = True
counter = 1
folder_id = None
while try_again:
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
for item in items:
logger.debug(f"found folder of {item['name']}")
folder_id = item['id']
try_again = False
if folder_id is None:
logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
counter += 1
time.sleep(10)
if counter > 18:
raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
# happens doing thumbnails
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'index.html'
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
# get id of the sub folder
results = self.service.files().list(
q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
filename = None
for item in items:
folder_id = item['id']
filename = b
if filename is None:
raise ValueError(f'Problem finding sub folder {a}')
# get id of file inside folder (or sub folder)
results = self.service.files().list(
q=f"'{folder_id}' in parents and name = '{filename}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
file_id = None
for item in items:
logger.debug(f"found file of {item['name']}")
file_id = item['id']
if file_id is None:
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
return foo
def exists(self, _key):
# TODO: How to check for google drive, as it accepts different names
return False
def uploadf(self, file, key, **_kwargs):
logger.debug(f"before {self.subfolder=}")
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f"after {self.subfolder=}")
# does folder eg SM0005 exist already inside parent of Files auto-archiver
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
folder_id_to_upload_to = item['id']
if folder_id_to_upload_to is None:
logger.debug(f'Creating new folder {self.subfolder}')
file_metadata = {
'name': [self.subfolder],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [self.root_folder_id]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
folder_id_to_upload_to = gd_file.get('id')
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
# will always return a and a blank b even if there is nothing to split
# https://stackoverflow.com/a/38149500/26086
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'out1.jpg'
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
results = self.service.files().list(
q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
sub_folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {a} which should never happen')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
sub_folder_id_to_upload_to = item['id']
if sub_folder_id_to_upload_to is None:
# create new folder
file_metadata = {
'name': [a],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [folder_id_to_upload_to]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
sub_folder_id_to_upload_to = gd_file.get('id')
filename = b
folder_id_to_upload_to = sub_folder_id_to_upload_to
# back to normal control flow
# upload file to gd
file_metadata = {
'name': [filename],
'parents': [folder_id_to_upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
def upload(self, filename: str, key: str, **kwargs):
# GD only requires the filename not a file reader
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
self.uploadf(filename, key, **kwargs)

View file

@ -5,6 +5,8 @@ import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger
@dataclass
@ -26,20 +28,11 @@ class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = self.clean_path(config.folder)
self.private = config.private
self.cdn_url = config.cdn_url
self.key_path = config.key_path
if config.no_folder:
self.folder = ""
else:
self.folder = config.folder
if len(self.folder) and self.folder[-1] != '/':
self.folder += '/'
if self.key_path == "random":
self.key_dict = {} # key => randomKey
self.s3 = boto3.client(
's3',
region_name=config.region,
@ -62,6 +55,7 @@ class S3Storage(Storage):
self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
final_key = self.key_dict[key]
return self.folder + final_key
return self.folder + self.clean_path(self.subfolder) + key
def get_cdn_url(self, key):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
@ -74,9 +68,9 @@ class S3Storage(Storage):
return False
def uploadf(self, file, key, **kwargs):
logger.debug(f'[S3 storage] uploading {file=}, {key=}')
if self.private:
extra_args = kwargs.get("extra_args", {})
else:
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)

View file

@ -10,6 +10,7 @@ class GWorksheet:
"""
COLUMN_NAMES = {
'url': 'link',
'subfolder': 'sub folder',
'archive': 'archive location',
'date': 'archive date',
'status': 'archive status',
@ -71,6 +72,15 @@ class GWorksheet:
return ''
return row[col_index]
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
"""
return self.get_cell or default value on error (eg: column is missing)
"""
try:
return self.get_cell(row, col, fresh)
except:
return default
def set_cell(self, row: int, col: str, val):
# row is 1-based
col_index = self._col_index(col) + 1