Save to folders for S3 and GD; add Google Drive (GD) storage

pull/35/head
Dave Mateer 2022-05-11 15:39:44 +01:00
parent b3599dee71
commit dbac5accbd
15 changed files with 469 additions and 56 deletions

View file

@ -8,3 +8,11 @@ TELEGRAM_API_ID=
TELEGRAM_API_HASH= TELEGRAM_API_HASH=
FACEBOOK_COOKIE=cookie: datr= xxxx FACEBOOK_COOKIE=cookie: datr= xxxx
# Google Drive: right-click on the folder, choose Get link, eg
# https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
# we want: 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
GD_ROOT_FOLDER_ID=
# Remember to share the folder with the service account, eg
# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com

1
.gitignore vendored
View file

@ -8,4 +8,5 @@ __pycache__/
anu.html anu.html
*.log *.log
.pytest_cach .pytest_cach
anon* anon*

27
.vscode/launch.json vendored 100644
View file

@ -0,0 +1,27 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Test Hashing",
"type": "python",
"request": "launch",
"program": "auto_archive.py",
"console": "integratedTerminal",
"justMyCode": true,
// "args": ["--sheet","Test Hashing"]
// "args": ["--sheet","Test Hashing","--use-filenumber-as-directory"]
"args": ["--sheet","Test Hashing","--use-filenumber-as-directory", "--storage=gd"]
},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

View file

@ -1,6 +1,6 @@
# auto-archiver # auto-archiver
This Python script will look for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis. This Python script will look for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
## Setup ## Setup
@ -14,7 +14,7 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables: A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
``` ```
DO_SPACES_REGION= DO_SPACES_REGION=
@ -23,8 +23,14 @@ DO_SPACES_KEY=
DO_SPACES_SECRET= DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY= INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET= INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=
GD_ROOT_FOLDER_ID=
``` ```
`.example.env` is an example of this file.
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
## Running ## Running
@ -93,3 +99,29 @@ graph TD
graph TD graph TD
A(BaseStorage) -->|parent of| B(S3Storage) A(BaseStorage) -->|parent of| B(S3Storage)
``` ```
## Saving into Folders
To use a spreadsheet column called `File Number` (eg SM001234) as a directory on the cloud storage, pass in:
```bash
python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
```
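With the flag set, each archived file is keyed under its `File Number` prefix. As an illustration (the paths below echo the examples in this commit's comments and are not exhaustive):
```
twitter__media_FM7-ggCUYAQHKWW.jpg          # without the flag
SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg   # with --use-filenumber-as-directory
```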
## Google Drive
To use Google Drive storage, set the id of the shared folder as `GD_ROOT_FOLDER_ID` in the `.env` file. The folder must be shared with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`:
```bash
python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
```
Note that you must use `--use-filenumber-as-directory` for Google Drive storage.
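The `GD_ROOT_FOLDER_ID` is the last path segment of the folder's share link (see `.example.env`). A minimal sketch for extracting it, assuming you have the share link as a string (the helper name here is illustrative, not part of the project):
```python
from urllib.parse import urlparse

def folder_id_from_share_link(link: str) -> str:
    # eg https://drive.google.com/drive/folders/1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X?usp=sharing
    # -> 1ljwzoAdKdJMJzRW9gPHDC8fkRykVH83X
    return urlparse(link).path.rstrip('/').split('/')[-1]
```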
## Telethon (Telegram's API Library)
Put your `anon.session` in the root directory so that the script doesn't stall and ask for authentication.
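One way to generate `anon.session` is to log in once interactively and then copy the resulting file into the script root. A sketch, assuming the same `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` values from `.env`:
```python
import os
from telethon.sync import TelegramClient

# prompts for a phone number and login code on first run, then writes anon.session
with TelegramClient('anon', int(os.getenv('TELEGRAM_API_ID')), os.getenv('TELEGRAM_API_HASH')) as client:
    print('Logged in as', client.get_me().username)
```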

View file

@ -14,6 +14,9 @@ from selenium.common.exceptions import TimeoutException
from storages import Storage from storages import Storage
from utils import mkdir_if_not_exists from utils import mkdir_if_not_exists
from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException
@dataclass @dataclass
class ArchiveResult: class ArchiveResult:
@ -39,7 +42,7 @@ class Archiver(ABC):
return self.__class__.__name__ return self.__class__.__name__
@abstractmethod @abstractmethod
def download(self, url, check_if_exists=False): pass def download(self, url, check_if_exists=False, filenumber=None): pass
def get_netloc(self, url): def get_netloc(self, url):
return urlparse(url).netloc return urlparse(url).netloc
@ -47,7 +50,8 @@ class Archiver(ABC):
def get_html_key(self, url): def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html") return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head> page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body> <body>
<h2>Archived media from {self.name}</h2> <h2>Archived media from {self.name}</h2>
@ -61,18 +65,24 @@ class Archiver(ABC):
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key page_filename = 'tmp/' + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f: with open(page_filename, "w") as f:
f.write(page) f.write(page)
page_hash = self.get_hash(page_filename) page_hash = self.get_hash(page_filename)
if filenumber is not None:
logger.trace(f'filenumber for directory is {filenumber}')
page_key = filenumber + "/" + page_key
self.storage.upload(page_filename, page_key, extra_args={ self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'}) 'ACL': 'public-read', 'ContentType': 'text/html'})
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail) return (page_cdn, page_hash, thumbnail)
def generate_media_page(self, urls, url, object): # eg images in a tweet save to cloud storage
def generate_media_page(self, urls, url, object, filenumber=None):
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
} }
@ -87,19 +97,30 @@ class Archiver(ABC):
filename = 'tmp/' + key filename = 'tmp/' + key
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers) d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f: with open(filename, 'wb') as f:
f.write(d.content) f.write(d.content)
if filenumber is not None:
logger.debug(f'filenumber for directory is {filenumber}')
key = filenumber + "/" + key
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key) self.storage.upload(filename, key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None: if thumbnail is None:
thumbnail = cdn_url thumbnail = cdn_url
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail) return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
def get_key(self, filename): def get_key(self, filename):
""" """
@ -119,15 +140,33 @@ class Archiver(ABC):
def get_hash(self, filename): def get_hash(self, filename):
f = open(filename, "rb") f = open(filename, "rb")
bytes = f.read() # read entire file as bytes bytes = f.read() # read entire file as bytes
hash = hashlib.sha256(bytes) hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
f.close() f.close()
return hash.hexdigest() return hash.hexdigest()
def get_screenshot(self, url): # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
# def get_screenshot(self, url, filenumber, storage="GD"):
def get_screenshot(self, url, filenumber):
key = self.get_key(urlparse(url).path.replace( key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key filename = 'tmp/' + key
# Dismiss the Facebook accept-cookies popup before taking the screenshot
if 'facebook.com' in url:
try:
logger.debug(f'Trying fb click accept cookie popup for {url}')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH,"//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug(f'fb click worked')
# linux server needs a sleep, otherwise the facebook cookie won't have worked and we'll get a popup on the next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies for url {url}')
try: try:
self.driver.get(url) self.driver.get(url)
time.sleep(6) time.sleep(6)
@ -135,8 +174,14 @@ class Archiver(ABC):
logger.info("TimeoutException loading page for screenshot") logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename) self.driver.save_screenshot(filename)
if filenumber is not None:
logger.debug(f'filenumber for directory is {filenumber}')
key = filenumber + "/" + key
self.storage.upload(filename, key, extra_args={ self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'}) 'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key) return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None): def get_thumbnails(self, filename, key, duration=None):
@ -167,10 +212,9 @@ class Archiver(ABC):
thumbnail_filename = thumbnails_folder + fname thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname key = key_folder + fname
cdn_url = self.storage.get_cdn_url(key)
self.storage.upload(thumbnail_filename, key) self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url) cdn_urls.append(cdn_url)
if len(cdn_urls) == 0: if len(cdn_urls) == 0:

View file

@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver): class TelegramArchiver(Archiver):
name = "telegram" name = "telegram"
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
if 't.me' != self.get_netloc(url): if 't.me' != self.get_netloc(url):
return False return False
@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
if url[-8:] != "?embed=1": if url[-8:] != "?embed=1":
url += "?embed=1" url += "?embed=1"
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url, filenumber=filenumber)
t = requests.get(url, headers=headers) t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser') s = BeautifulSoup(t.content, 'html.parser')
@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])] urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
images += urls images += urls
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber)
time_elements = s.find_all('time') time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None timestamp = time_elements[0].get('datetime') if len(time_elements) else None
@ -52,6 +52,9 @@ class TelegramArchiver(Archiver):
video_id = video_url.split('/')[-1].split('?')[0] video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id) key = self.get_key(video_id)
if filenumber is not None:
key = filenumber + "/" + key
filename = 'tmp/' + key filename = 'tmp/' + key
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)

View file

@ -41,20 +41,22 @@ class TelethonArchiver(Archiver):
media.append(post) media.append(post)
return media return media
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url) matches = self.link_pattern.findall(url)
if not len(matches): if not len(matches):
return False return False
status = "success" status = "success"
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url, filenumber)
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start(): with self.client.start():
matches = list(matches[0]) matches = list(matches[0])
chat, post_id = matches[1], matches[2] chat, post_id = matches[1], matches[2]
post_id = int(post_id) post_id = int(post_id)
try: try:
post = self.client.get_messages(chat, ids=post_id) post = self.client.get_messages(chat, ids=post_id)
except ValueError as e: except ValueError as e:
@ -65,9 +67,13 @@ class TelethonArchiver(Archiver):
if len(media_posts) > 1: if len(media_posts) > 1:
key = self.get_html_key(url) key = self.get_html_key(url)
cdn_url = self.storage.get_cdn_url(key)
if filenumber is not None:
key = filenumber + "/" + key
if check_if_exists and self.storage.exists(key): if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists as it is not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
status = 'already archived' status = 'already archived'
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
@ -78,19 +84,26 @@ class TelethonArchiver(Archiver):
if len(mp.message) > len(message): message = mp.message if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1] key = filename.split('tmp/')[1]
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key) self.storage.upload(filename, key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
os.remove(filename) os.remove(filename)
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1: elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}') key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'tmp/{key}') filename = self.client.download_media(post.media, f'tmp/{key}')
key = filename.split('tmp/')[1].replace(" ", "") key = filename.split('tmp/')[1].replace(" ", "")
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key) self.storage.upload(filename, key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
@ -99,5 +112,5 @@ class TelethonArchiver(Archiver):
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)

View file

@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver): class TiktokArchiver(Archiver):
name = "tiktok" name = "tiktok"
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
if 'tiktok.com' not in url: if 'tiktok.com' not in url:
return False return False
@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(), thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot) hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl: except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL' status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
return ArchiveResult(status=status) return ArchiveResult(status=status)
except: except:
error = traceback.format_exc() error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error) status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error' + str(error))
return ArchiveResult(status=status) return ArchiveResult(status=status)

View file

@ -1,6 +1,5 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger from loguru import logger
import requests
from urllib.parse import urlparse from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
@ -9,7 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
name = "twitter" name = "twitter"
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
if 'twitter.com' != self.get_netloc(url): if 'twitter.com' != self.get_netloc(url):
return False return False
@ -24,11 +24,14 @@ class TwitterArchiver(Archiver):
try: try:
tweet = next(scr.get_items()) tweet = next(scr.get_items())
except: except Exception as ex:
logger.warning('wah wah') template = "TwitterArchiver could not get tweet and threw an exception, which can happen with a media-sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
message = template.format(type(ex).__name__, ex.args)
logger.warning(message)
return False return False
if tweet.media is None: if tweet.media is None:
logger.trace(f'No media found')
return False return False
urls = [] urls = []
@ -45,8 +48,8 @@ class TwitterArchiver(Archiver):
else: else:
logger.warning(f"Could not get media URL of {media}") logger.warning(f"Could not get media URL of {media}")
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json()) page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url, filenumber)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)

View file

@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
from storages import Storage from storages import Storage
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
from loguru import logger
class WaybackArchiver(Archiver): class WaybackArchiver(Archiver):
name = "wayback" name = "wayback"
@ -12,7 +14,7 @@ class WaybackArchiver(Archiver):
super(WaybackArchiver, self).__init__(storage, driver) super(WaybackArchiver, self).__init__(storage, driver)
self.seen_urls = {} self.seen_urls = {}
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
if check_if_exists and url in self.seen_urls: if check_if_exists and url in self.seen_urls:
return self.seen_urls[url] return self.seen_urls[url]
@ -25,9 +27,11 @@ class WaybackArchiver(Archiver):
'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) 'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200: if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed") return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json(): if 'job_id' not in r.json() and 'message' in r.json():
logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id'] job_id = r.json()['job_id']
@ -71,7 +75,7 @@ class WaybackArchiver(Archiver):
except: except:
title = "Could not get title" title = "Could not get title"
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url, filenumber)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result self.seen_urls[url] = result
return result return result

View file

@ -15,7 +15,7 @@ class YoutubeDLArchiver(Archiver):
super().__init__(storage, driver) super().__init__(storage, driver)
self.fb_cookie = fb_cookie self.fb_cookie = fb_cookie
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False, filenumber=None):
netloc = self.get_netloc(url) netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'www.facebook.com']: if netloc in ['facebook.com', 'www.facebook.com']:
logger.debug('Using Facebook cookie') logger.debug('Using Facebook cookie')
@ -27,13 +27,17 @@ class YoutubeDLArchiver(Archiver):
try: try:
info = ydl.extract_info(url, download=False) info = ydl.extract_info(url, download=False)
except yt_dlp.utils.DownloadError: except yt_dlp.utils.DownloadError as e:
# no video here logger.debug(f'No video - Youtube normal control flow: {e}')
return False
except Exception as e:
logger.debug(f'ytdlp exception, which is normal - for example a facebook page with images only will cause an IndexError: list index out of range. Exception here is: \n {e}')
return False return False
if info.get('is_live', False): if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now") logger.warning("Live streaming media, not archiving now")
return ArchiveResult(status="Streaming media") return ArchiveResult(status="Streaming media")
if 'twitter.com' in netloc: if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']: if 'https://twitter.com/' in info['webpage_url']:
logger.info('Found https://twitter.com/ in the download url from Twitter') logger.info('Found https://twitter.com/ in the download url from Twitter')
@ -41,7 +45,6 @@ class YoutubeDLArchiver(Archiver):
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet') logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
return False return False
if check_if_exists: if check_if_exists:
if 'entries' in info: if 'entries' in info:
if len(info['entries']) > 1: if len(info['entries']) > 1:
@ -58,6 +61,9 @@ class YoutubeDLArchiver(Archiver):
key = self.get_key(filename) key = self.get_key(filename)
if filenumber is not None:
key = filenumber + "/" + key
if self.storage.exists(key): if self.storage.exists(key):
status = 'already archived' status = 'already archived'
cdn_url = self.storage.get_cdn_url(key) cdn_url = self.storage.get_cdn_url(key)
@ -81,12 +87,19 @@ class YoutubeDLArchiver(Archiver):
if status != 'already archived': if status != 'already archived':
key = self.get_key(filename) key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
if filenumber is not None:
key = filenumber + "/" + key
self.storage.upload(filename, key) self.storage.upload(filename, key)
# filename ='tmp/sDE-qZdi8p8.webm'
# key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
cdn_url = self.storage.get_cdn_url(key)
hash = self.get_hash(filename) hash = self.get_hash(filename)
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url, filenumber)
# get duration # get duration
duration = info.get('duration') duration = info.get('duration')

View file

@ -68,7 +68,7 @@ def expand_url(url):
return url return url
def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json') gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet) sh = gc.open(sheet)
@ -78,6 +78,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
key=os.getenv('DO_SPACES_KEY'), key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET') secret=os.getenv('DO_SPACES_SECRET')
) )
gd_config = GDConfig(
root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
)
telegram_config = archivers.TelegramConfig( telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'), api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH') api_hash=os.getenv('TELEGRAM_API_HASH')
@ -91,12 +94,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
gw = GWorksheet(wks, header_row=header, columns=columns) gw = GWorksheet(wks, header_row=header, columns=columns)
if not gw.col_exists('url'): if not gw.col_exists('url'):
logger.warning( logger.info(
f'No "{columns["url"]}" column found, skipping worksheet {wks.title}') f'No "{columns["url"]}" column found, skipping worksheet {wks.title}')
continue continue
if not gw.col_exists('status'): if not gw.col_exists('status'):
logger.warning( logger.info(
f'No "{columns["status"]}" column found, skipping worksheet {wks.title}') f'No "{columns["status"]}" column found, skipping worksheet {wks.title}')
continue continue
@ -104,26 +107,30 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config) s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
active_archivers = [ gd_client = GDStorage(gd_config)
archivers.TelethonArchiver(s3_client, driver, telegram_config),
archivers.TelegramArchiver(s3_client, driver),
archivers.TiktokArchiver(s3_client, driver),
archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
archivers.TwitterArchiver(s3_client, driver),
archivers.WaybackArchiver(s3_client, driver)
]
# loop through rows in worksheet # loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1): for row in range(1 + header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url') url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status') original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]: if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress') gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url) url = expand_url(url)
if usefilenumber:
filenumber = gw.get_cell(row, 'filenumber')
logger.debug(f'filenumber is {filenumber}')
if filenumber == "":
logger.warning(f'Logic error on row {row} with url {url} - the feature flag for usefilenumber is True, yet cannot find a corresponding filenumber')
gw.set_cell(row, 'status', 'Missing filenumber')
continue
else:
# We use this throughout the app to decide where to save
filenumber = None
# make a new driver so each spreadsheet row is idempotent # make a new driver so each spreadsheet row is idempotent
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
@ -134,24 +141,58 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
driver.set_window_size(1400, 2000) driver.set_window_size(1400, 2000)
# in seconds, telegram screenshots catch which don't come back # in seconds, telegram screenshots catch which don't come back
driver.set_page_load_timeout(120) driver.set_page_load_timeout(120)
# choose the storage client based on the --storage argument
storage_client = None
if storage == "s3":
storage_client = s3_client
elif storage == "gd":
storage_client = gd_client
else:
raise ValueError(f'Cannot get storage_client for storage type {storage}')
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelethonArchiver(storage_client, driver, telegram_config),
archivers.TelegramArchiver(storage_client, driver),
archivers.TiktokArchiver(storage_client, driver),
archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
archivers.TwitterArchiver(storage_client, driver),
archivers.WaybackArchiver(storage_client, driver)
]
for archiver in active_archivers: for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}') logger.debug(f'Trying {archiver} on row {row}')
try: try:
if usefilenumber:
# using filenumber to store in folders so not checking for existence of that url
result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
else:
result = archiver.download(url, check_if_exists=True) result = archiver.download(url, check_if_exists=True)
except Exception as e: except Exception as e:
result = False result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}') logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
if result: if result:
if result.status in ['success', 'already archived']: # IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \ result.status = archiver.name + \
": " + str(result.status) ": " + str(result.status)
logger.success( logger.success(
f'{archiver} succeeded on row {row}') f'{archiver} succeeded on row {row}, url {url}')
break break
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning( logger.warning(
f'{archiver} did not succeed on row {row}, final status: {result.status}') f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \ result.status = archiver.name + \
": " + str(result.status) ": " + str(result.status)
# get rid of driver so can reload on next row # get rid of driver so can reload on next row
@ -165,22 +206,34 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
@logger.catch @logger.catch
def main(): def main():
logger.debug(f'Passed args:{sys.argv}') logger.debug(f'Passed args:{sys.argv}')
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Automatically archive social media videos from a Google Sheets document') description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True) parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row') parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
parser.add_argument('--private', action='store_true', help='Store content without public access permission') parser.add_argument('--private', action='store_true', help='Store content without public access permission')
parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012')
parser.add_argument('--storage', action='store', dest='storage', default='s3', \
help='s3 or gd storage. Default is s3. NOTE GD storage supports only using filenumber')
for k, v in GWorksheet.COLUMN_NAMES.items(): for k, v in GWorksheet.COLUMN_NAMES.items():
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})') parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
args = parser.parse_args() args = parser.parse_args()
config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()} config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
logger.info(f'Opening document {args.sheet} for header {args.header}') logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')
# https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
# args.usefilenumber is True (of type bool) when set, or None when the argument is not there
usefilenumber = False
if args.usefilenumber:
usefilenumber = True
mkdir_if_not_exists('tmp') mkdir_if_not_exists('tmp')
process_sheet(args.sheet, header=args.header, columns=config_columns) process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns)
shutil.rmtree('tmp') shutil.rmtree('tmp')

View file

@ -17,5 +17,12 @@ class Storage(ABC):
def upload(self, filename: str, key: str, **kwargs): def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
# S3 requires an open file, GD only the filename
storage = type(self).__name__
if storage == "GDStorage":
self.uploadf(filename, key, **kwargs)
elif storage == "S3Storage":
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs) self.uploadf(f, key, **kwargs)
else:
raise ValueError(f'Cannot determine storage type {storage} in base_storage.py')

View file

@ -0,0 +1,202 @@
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
import time
@dataclass
class GDConfig:
root_folder_id: str
class GDStorage(Storage):
def __init__(self, config: GDConfig):
self.root_folder_id = config.root_folder_id
SCOPES = ['https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
self.service = build('drive', 'v3', credentials=creds)
def _get_path(self, key):
return self.folder + key
def get_cdn_url(self, key):
# only supports files saved in a folder for GD
# S3 supports folder and all stored in the root
# key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
foldername = key.split('/', 1)[0]
# eg twitter__media_asdf.jpg
filename = key.split('/', 1)[1]
logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
# retry policy on Google Drive
try_again = True
counter = 1
folder_id = None
while try_again:
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
and name = '{foldername}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
for item in items:
logger.debug(f"found folder of {item['name']}")
folder_id= item['id']
try_again = False
if folder_id is None:
logger.debug(f'Cannot find {foldername=}, waiting and trying again {counter=}')
counter += 1
time.sleep(10)
if counter > 18:
raise ValueError(f'Cannot find {foldername} after 18 retries, pausing 10 seconds each time, which is 3 minutes')
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html
# happens doing thumbnails
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'index.html'
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
# get id of the sub folder
results = self.service.files().list(q=f"'{folder_id}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
filename = None
for item in items:
folder_id = item['id']
filename = b
if filename is None:
raise ValueError(f'Problem finding sub folder {a}')
# get id of file inside folder (or sub folder)
results = self.service.files().list(q=f"'{folder_id}' in parents \
and name = '{filename}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
file_id = None
for item in items:
logger.debug(f"found file of {item['name']}")
file_id= item['id']
if file_id is None:
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
return "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
def exists(self, key):
# Not implemented yet
# Google Drive will accept duplicate filenames as each is stored under a different file id
# try:
# self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
# return True
# except ClientError:
# return False
return False
def uploadf(self, file, key, **kwargs):
# split on first occurrence of /
# eg SM0005
foldername = key.split('/', 1)[0]
# eg twitter__media_asdf.jpg
filename = key.split('/', 1)[1]
# does folder eg SM0005 exist already inside parent of Files auto-archiver
results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{foldername}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
folder_id_to_upload_to = item['id']
if folder_id_to_upload_to is None:
logger.debug(f'Creating new folder {foldername}')
file_metadata = {
'name': [foldername],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [self.root_folder_id]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
folder_id_to_upload_to = gd_file.get('id')
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg
# happens doing thumbnails
# will always return a and a blank b even if there is nothing to split
# https://stackoverflow.com/a/38149500/26086
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'out1.jpg'
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
and mimeType='application/vnd.google-apps.folder' \
and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
sub_folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {a} which should never happen')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
sub_folder_id_to_upload_to = item['id']
if sub_folder_id_to_upload_to is None:
# create new folder
file_metadata = {
'name': [a],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [folder_id_to_upload_to]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
sub_folder_id_to_upload_to = gd_file.get('id')
filename = b
folder_id_to_upload_to = sub_folder_id_to_upload_to
# back to normal control flow
# else:
# upload file to gd
file_metadata = {
# 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
'name': [filename],
'parents': [folder_id_to_upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata,
media_body=media,
fields='id').execute()

View file

@ -9,6 +9,7 @@ class GWorksheet:
eg: if header=4, row 5 will be the first with data. eg: if header=4, row 5 will be the first with data.
""" """
COLUMN_NAMES = { COLUMN_NAMES = {
'filenumber': 'file number',
'url': 'link', 'url': 'link',
'archive': 'archive location', 'archive': 'archive location',
'date': 'archive date', 'date': 'archive date',