mirror of https://github.com/bellingcat/auto-archiver

refactoring filenumber into subfolder

parent 03aa02e88b
commit 159adf9afe

README.md
@@ -101,24 +101,18 @@ graph TD
     A(BaseStorage) -->|parent of| C(GoogleDriveStorage)
 ```
 
-## Saving into Folders
+## Saving into Subfolders
 
-To use a column from the spreadsheet called `File Number` eg SM001234 as a directory on the cloud storage, you need to pass in
-
-```bash
-python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory
-```
+You can have a column in the spreadsheet for the argument `--col-subfolder` that is passed to the storage and can specify a subfolder to put the archived link into.
 
 ## Google Drive
 
 To use Google Drive storage you need the id of the shared folder in the `.env` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
 
 ```bash
-python auto_archive.py --sheet 'Sheet Name' --use-filenumber-as-directory --storage='gd'
+python auto_archive.py --sheet 'Sheet Name' --storage='gd'
 ```
 
-Note the you must use filenumber for Google Drive Storage.
-
 ## Telethon (Telegrams API Library)
 
 Put your `anon.session` in the root, so that it doesn't stall and ask for authentication
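The `--col-subfolder` option referenced in the new README text is generated by the generic `--col-{name}` argument loop added later in this commit, with a default column header of `sub folder`. An illustrative invocation (the sheet and column names are examples, not fixed values):

```bash
# read the per-row subfolder from a spreadsheet column titled "sub folder"
python auto_archive.py --sheet 'Sheet Name' --col-subfolder 'sub folder' --storage='gd'
```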
@@ -18,6 +18,7 @@ from selenium.webdriver.common.by import By
 from loguru import logger
 from selenium.common.exceptions import TimeoutException
+
 
 @dataclass
 class ArchiveResult:
     status: str
@@ -42,7 +43,7 @@ class Archiver(ABC):
         return self.__class__.__name__
 
     @abstractmethod
-    def download(self, url, check_if_exists=False, filenumber=None): pass
+    def download(self, url, check_if_exists=False): pass
 
     def get_netloc(self, url):
         return urlparse(url).netloc
@@ -51,7 +52,7 @@ class Archiver(ABC):
         return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
 
     # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
-    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
+    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
 <body>
 <h2>Archived media from {self.name}</h2>
@@ -71,10 +72,6 @@ class Archiver(ABC):
 
         page_hash = self.get_hash(page_filename)
 
-        if filenumber != None:
-            logger.trace(f'filenumber for directory is {filenumber}')
-            page_key = filenumber + "/" + page_key
-
         self.storage.upload(page_filename, page_key, extra_args={
             'ACL': 'public-read', 'ContentType': 'text/html'})
 
@@ -82,7 +79,7 @@ class Archiver(ABC):
         return (page_cdn, page_hash, thumbnail)
 
     # eg images in a tweet save to cloud storage
-    def generate_media_page(self, urls, url, object, filenumber=None):
+    def generate_media_page(self, urls, url, object):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
         }
@@ -102,10 +99,6 @@ class Archiver(ABC):
             with open(filename, 'wb') as f:
                 f.write(d.content)
 
-            if filenumber is not None:
-                logger.debug(f'filenumber for directory is {filenumber}')
-                key = filenumber + "/" + key
-
             # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
             # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
             # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
@@ -120,7 +113,7 @@ class Archiver(ABC):
                 thumbnail = cdn_url
             uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
 
-        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
+        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
 
     def get_key(self, filename):
         """
@@ -141,15 +134,14 @@ class Archiver(ABC):
         f = open(filename, "rb")
         bytes = f.read()  # read entire file as bytes
 
+        # TODO: customizable hash
         hash = hashlib.sha256(bytes)
         # option to use SHA3_512 instead
         # hash = hashlib.sha3_512(bytes)
         f.close()
         return hash.hexdigest()
 
-    # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
-    # def get_screenshot(self, url, filenumber, storage="GD"):
-    def get_screenshot(self, url, filenumber):
+    def get_screenshot(self, url):
         key = self.get_key(urlparse(url).path.replace(
             "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
         filename = 'tmp/' + key
@@ -159,7 +151,7 @@ class Archiver(ABC):
         try:
             logger.debug(f'Trying fb click accept cookie popup for {url}')
             self.driver.get("http://www.facebook.com")
-            foo = self.driver.find_element(By.XPATH,"//button[@data-cookiebanner='accept_only_essential_button']")
+            foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
             foo.click()
             logger.debug(f'fb click worked')
             # linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page
@@ -174,11 +166,6 @@ class Archiver(ABC):
             logger.info("TimeoutException loading page for screenshot")
 
         self.driver.save_screenshot(filename)
 
-        if filenumber is not None:
-            logger.debug(f'filenumber for directory is {filenumber}')
-            key = filenumber + "/" + key
-
         self.storage.upload(filename, key, extra_args={
             'ACL': 'public-read', 'ContentType': 'image/png'})
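The pattern deleted throughout this file is per-call key prefixing: every archiver built `filenumber + "/" + key` itself before uploading. After this commit the storage object owns that prefix instead. A hypothetical before/after sketch (not code from the repo):

```python
def make_key_old(key, filenumber=None):
    # pre-refactor: every archiver prefixed the storage key itself
    if filenumber is not None:
        key = filenumber + "/" + key  # eg 'SM3013/twitter__media_x.jpg'
    return key

def make_key_new(key):
    # post-refactor: archivers pass plain keys; the storage layer prepends
    # its 'subfolder' property, set once per spreadsheet row via
    # update_properties(subfolder=...) (added later in this commit)
    return key
```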
@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TelegramArchiver(Archiver):
     name = "telegram"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 't.me' != self.get_netloc(url):
             return False
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
         if url[-8:] != "?embed=1":
             url += "?embed=1"
 
-        screenshot = self.get_screenshot(url, filenumber=filenumber)
+        screenshot = self.get_screenshot(url)
 
         t = requests.get(url, headers=headers)
         s = BeautifulSoup(t.content, 'html.parser')
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
             urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
             images += urls
 
-        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber)
+        page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
         time_elements = s.find_all('time')
         timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
@@ -52,9 +52,6 @@ class TelegramArchiver(Archiver):
         video_id = video_url.split('/')[-1].split('?')[0]
         key = self.get_key(video_id)
 
-        if filenumber is not None:
-            key = filenumber + "/" + key
-
         filename = 'tmp/' + key
         cdn_url = self.storage.get_cdn_url(key)
@@ -7,6 +7,7 @@ from loguru import logger
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
 from telethon.sync import TelegramClient
+from telethon.errors import ChannelInvalidError
 
 
 @dataclass
@@ -41,14 +42,14 @@ class TelethonArchiver(Archiver):
                 media.append(post)
         return media
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         matches = self.link_pattern.findall(url)
         if not len(matches):
             return False
 
         status = "success"
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
 
         # app will ask (stall for user input!) for phone number and auth code if anon.session not found
         with self.client.start():
@@ -60,7 +61,11 @@ class TelethonArchiver(Archiver):
             try:
                 post = self.client.get_messages(chat, ids=post_id)
             except ValueError as e:
-                logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
+                logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
+                return False
+            except ChannelInvalidError as e:
+                # TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
+                logger.error(f'Could not fetch telegram {url} possibly it\'s private or not displayable in : {e}')
                 return False
 
             media_posts = self._get_media_posts_in_group(chat, post)
@@ -68,11 +73,8 @@ class TelethonArchiver(Archiver):
             if len(media_posts) > 1:
                 key = self.get_html_key(url)
 
-                if filenumber is not None:
-                    key = filenumber + "/" + key
-
                 if check_if_exists and self.storage.exists(key):
                     # only s3 storage supports storage.exists as not implemented on gd
                     cdn_url = self.storage.get_cdn_url(key)
                     status = 'already archived'
                     return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
@@ -84,26 +86,19 @@ class TelethonArchiver(Archiver):
                     if len(mp.message) > len(message): message = mp.message
                     filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
                     key = filename.split('tmp/')[1]
 
-                    if filenumber is not None:
-                        key = filenumber + "/" + key
                     self.storage.upload(filename, key)
                     hash = self.get_hash(filename)
                     cdn_url = self.storage.get_cdn_url(key)
                     uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
                     os.remove(filename)
 
-                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
+                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
 
                 return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
             elif len(media_posts) == 1:
                 key = self.get_key(f'{chat}_{post_id}')
                 filename = self.client.download_media(post.media, f'tmp/{key}')
                 key = filename.split('tmp/')[1].replace(" ", "")
 
-                if filenumber is not None:
-                    key = filenumber + "/" + key
-
                 self.storage.upload(filename, key)
                 hash = self.get_hash(filename)
                 cdn_url = self.storage.get_cdn_url(key)
@@ -112,5 +107,5 @@ class TelethonArchiver(Archiver):
 
                 return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
 
-            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
+            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
             return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
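The new `ChannelInvalidError` branch mirrors the existing `ValueError` one: both now log at error level and abort the download. A condensed, standalone sketch of that fetch-and-fail pattern, assuming a started Telethon client (function name is illustrative):

```python
from telethon.errors import ChannelInvalidError

def fetch_post(client, chat, post_id, url):
    # returns the message, or None when the chat is private/invalid
    try:
        return client.get_messages(chat, ids=post_id)
    except ValueError as e:
        print(f"could not fetch {url}, possibly private: {e}")
    except ChannelInvalidError as e:
        # see https://github.com/LonamiWebs/Telethon/issues/3819
        print(f"could not fetch {url}, invalid channel: {e}")
    return None
```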
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TiktokArchiver(Archiver):
     name = "tiktok"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TwitterArchiver(Archiver):
     name = "twitter"
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
 
         if 'twitter.com' != self.get_netloc(url):
             return False
@@ -16,7 +16,7 @@ class TwitterArchiver(Archiver):
         tweet_id = urlparse(url).path.split('/')
         if 'status' in tweet_id:
             i = tweet_id.index('status')
-            tweet_id = tweet_id[i+1]
+            tweet_id = tweet_id[i + 1]
         else:
             return False
 
@@ -25,9 +25,7 @@ class TwitterArchiver(Archiver):
         try:
             tweet = next(scr.get_items())
         except Exception as ex:
-            template = "TwitterArchiver cant get tweet and threw, which can happen if a media sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
-            message = template.format(type(ex).__name__, ex.args)
-            logger.warning(message)
+            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
             return False
 
         if tweet.media is None:
@@ -48,8 +46,8 @@ class TwitterArchiver(Archiver):
             else:
                 logger.warning(f"Could not get media URL of {media}")
 
-        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
+        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
 
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
 
         return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
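As a worked example of the id extraction this file keeps unchanged: splitting the URL path on `/` and taking the element after `status` yields the tweet id.

```python
from urllib.parse import urlparse

url = 'https://twitter.com/minmyatnaing13/status/1499415562937503751'
parts = urlparse(url).path.split('/')
# parts == ['', 'minmyatnaing13', 'status', '1499415562937503751']
tweet_id = parts[parts.index('status') + 1]  # -> '1499415562937503751'
```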
@@ -14,7 +14,7 @@ class WaybackArchiver(Archiver):
         super(WaybackArchiver, self).__init__(storage, driver)
         self.seen_urls = {}
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         if check_if_exists and url in self.seen_urls:
             return self.seen_urls[url]
 
@@ -75,7 +75,7 @@ class WaybackArchiver(Archiver):
         except:
             title = "Could not get title"
 
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
         result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
         self.seen_urls[url] = result
         return result
@@ -7,6 +7,7 @@ from loguru import logger
 from .base_archiver import Archiver, ArchiveResult
 from storages import Storage
+
 
 class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
@@ -15,7 +16,7 @@ class YoutubeDLArchiver(Archiver):
         super().__init__(storage, driver)
         self.fb_cookie = fb_cookie
 
-    def download(self, url, check_if_exists=False, filenumber=None):
+    def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
         if netloc in ['facebook.com', 'www.facebook.com']:
             logger.debug('Using Facebook cookie')
@@ -61,9 +62,6 @@ class YoutubeDLArchiver(Archiver):
 
             key = self.get_key(filename)
 
-            if filenumber is not None:
-                key = filenumber + "/" + key
-
             if self.storage.exists(key):
                 status = 'already archived'
                 cdn_url = self.storage.get_cdn_url(key)
@@ -87,10 +85,6 @@ class YoutubeDLArchiver(Archiver):
 
         if status != 'already archived':
             key = self.get_key(filename)
 
-            if filenumber is not None:
-                key = filenumber + "/" + key
-
             self.storage.upload(filename, key)
 
         # filename ='tmp/sDE-qZdi8p8.webm'
@@ -98,8 +92,7 @@ class YoutubeDLArchiver(Archiver):
         cdn_url = self.storage.get_cdn_url(key)
 
         hash = self.get_hash(filename)
-        screenshot = self.get_screenshot(url, filenumber)
+        screenshot = self.get_screenshot(url)
 
         # get duration
         duration = info.get('duration')
@@ -115,9 +108,9 @@ class YoutubeDLArchiver(Archiver):
 
         timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
             if 'timestamp' in info else \
             datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
             if 'upload_date' in info and info['upload_date'] is not None else \
             None
 
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
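The chained conditional expression that builds `timestamp` is kept by this commit but is easier to follow expanded. An equivalent sketch, which also mirrors the original's quirk of returning an ISO string in one branch and a datetime object in the other:

```python
import datetime

def extract_timestamp(info: dict):
    # prefer the exact unix timestamp the downloader provides
    if 'timestamp' in info:
        return datetime.datetime.utcfromtimestamp(info['timestamp']) \
            .replace(tzinfo=datetime.timezone.utc).isoformat()
    # fall back to the coarser YYYYMMDD upload date
    if info.get('upload_date') is not None:
        return datetime.datetime.strptime(info['upload_date'], '%Y%m%d') \
            .replace(tzinfo=datetime.timezone.utc)
    return None
```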
@@ -23,6 +23,7 @@ logger.add("logs/5error.log", level="ERROR")
 
 load_dotenv()
 
+
 def update_sheet(gw, row, result: archivers.ArchiveResult):
     cell_updates = []
     row_values = gw.get_row(row)
@@ -68,7 +69,7 @@ def expand_url(url):
     return url
 
 
-def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
+def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
 
@@ -86,8 +87,6 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
         api_hash=os.getenv('TELEGRAM_API_HASH')
     )
 
-
-
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
@@ -121,16 +120,7 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
 
             url = expand_url(url)
 
-            if usefilenumber:
-                filenumber = gw.get_cell(row, 'filenumber')
-                logger.debug(f'filenumber is {filenumber}')
-                if filenumber == "":
-                    logger.warning(f'Logic error on row {row} with url {url} - the feature flag for usefilenumber is True, yet cant find a corresponding filenumber')
-                    gw.set_cell(row, 'status', 'Missing filenumber')
-                    continue
-            else:
-                # We will use this through the app to differentiate between where to save
-                filenumber = None
+            subfolder = gw.get_cell_or_default(row, 'subfolder')
 
             # make a new driver so each spreadsheet row is idempotent
             options = webdriver.FirefoxOptions()
@@ -142,7 +132,7 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
             # in seconds, telegram screenshots catch which don't come back
             driver.set_page_load_timeout(120)
 
             # client
             storage_client = None
             if storage == "s3":
                 storage_client = s3_client
@@ -150,6 +140,7 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
                 storage_client = gd_client
             else:
                 raise ValueError(f'Cant get storage_client {storage_client}')
+            storage_client.update_properties(subfolder=subfolder)
 
             # order matters, first to succeed excludes remaining
             active_archivers = [
@@ -164,12 +155,12 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
                 logger.debug(f'Trying {archiver} on row {row}')
 
                 try:
-                    if usefilenumber:
-                        # using filenumber to store in folders so not checking for existence of that url
-                        result = archiver.download(url, check_if_exists=False, filenumber=filenumber)
-                    else:
-                        result = archiver.download(url, check_if_exists=True)
+                    result = archiver.download(url, check_if_exists=True)
+                except KeyboardInterrupt:
+                    logger.warning("caught interrupt")
+                    gw.set_cell(row, 'status', '')
+                    driver.quit()
+                    exit()
                 except Exception as e:
                     result = False
                     logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
@@ -180,7 +171,7 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
                     result.status = archiver.name + \
                         ": " + str(result.status)
                     logger.success(
                         f'{archiver} succeeded on row {row}, url {url}')
                     break
 
                 # wayback has seen this url before so keep existing status
@@ -203,6 +194,7 @@ def process_sheet(sheet, usefilenumber=False, storage="s3", header=1, columns=GW
             gw.set_cell(row, 'status', 'failed: no archiver')
         logger.success(f'Finshed worksheet {wks.title}')
 
+
 @logger.catch
 def main():
     logger.debug(f'Passed args:{sys.argv}')
@@ -213,27 +205,21 @@ def main():
     parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
     parser.add_argument('--private', action='store_true', help='Store content without public access permission')
 
-    parser.add_argument('--use-filenumber-as-directory', action=argparse.BooleanOptionalAction, dest='usefilenumber', \
-                        help='Will save files into a subfolder on cloud storage which has the File Number eg SM3012')
-    parser.add_argument('--storage', action='store', dest='storage', default='s3', \
-                        help='s3 or gd storage. Default is s3. NOTE GD storage supports only using filenumber')
+    parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
 
     for k, v in GWorksheet.COLUMN_NAMES.items():
-        parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
+        help = f"the name of the column to fill with {k} (defaults={v})"
+        if k == "subfolder":
+            help = f"the name of the column to read the {k} from (defaults={v})"
+        parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
 
     args = parser.parse_args()
     config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
 
-    logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber} and storage {args.storage}')
+    logger.info(f'Opening document {args.sheet} for header {args.header} and storage {args.storage}')
 
-    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
-    # args.filenumber is True (of type bool) when set or None when argument is not there
-    usefilenumber = False
-    if args.usefilenumber:
-        usefilenumber = True
-
     mkdir_if_not_exists('tmp')
-    process_sheet(args.sheet, usefilenumber, args.storage, args.header, config_columns)
+    process_sheet(args.sheet, args.storage, args.header, config_columns)
     shutil.rmtree('tmp')
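Taken together, the new per-row flow is: read the optional subfolder cell, hand it to the storage object once, then call the archivers with plain keys. A condensed sketch using only names from this diff, with error handling elided:

```python
# condensed per-row flow; gw, storage_client and active_archivers exist in context
subfolder = gw.get_cell_or_default(row, 'subfolder')    # None/'' if column absent
storage_client.update_properties(subfolder=subfolder)  # storage now owns the prefix
for archiver in active_archivers:
    result = archiver.download(url, check_if_exists=True)
    if result:
        break
```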
@@ -1,5 +1,6 @@
 from loguru import logger
 from abc import ABC, abstractmethod
+from pathlib import Path
 
 
 class Storage(ABC):
@@ -19,3 +20,25 @@ class Storage(ABC):
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         with open(filename, 'rb') as f:
             self.uploadf(f, key, **kwargs)
+
+    def update_properties(self, **kwargs):
+        """
+        method used to update general properties that some children may use
+        and others not, but that all can call
+        """
+        for k, v in kwargs.items():
+            if k in self.get_allowed_properties():
+                setattr(self, k, v)
+            else:
+                logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
+
+    def get_allowed_properties(self):
+        """
+        child classes should specify which properties they allow to be set
+        """
+        return set(["subfolder"])
+
+    def clean_path(self, folder, default="", add_forward_slash=True):
+        if folder is None or type(folder) != str or len(folder.strip()) == 0:
+            return default
+        return str(Path(folder)) + ("/" if add_forward_slash else "")
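A quick illustration of how the new `Storage` helpers behave, assuming some concrete subclass instance `storage` (values are illustrative):

```python
# update_properties only accepts keys returned by get_allowed_properties()
storage.update_properties(subfolder='SM0005')  # allowed, sets storage.subfolder
storage.update_properties(bucket='other')      # not allowed, only logs a warning

# clean_path normalizes via pathlib.Path and falls back on blank input
storage.clean_path('SM0005')                          # -> 'SM0005/'
storage.clean_path('   ', default='')                 # -> ''
storage.clean_path('a/b/', add_forward_slash=False)   # -> 'a/b'
```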
@@ -15,6 +15,7 @@ class GDConfig:
 
 
 class GDStorage(Storage):
+    DEFAULT_UPLOAD_FOLDER_NAME = "default"
 
     def __init__(self, config: GDConfig):
         self.root_folder_id = config.root_folder_id
@@ -22,19 +23,14 @@ class GDStorage(Storage):
         creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
         self.service = build('drive', 'v3', credentials=creds)
 
-    def _get_path(self, key):
-        return self.folder + key
-
     def get_cdn_url(self, key):
-        # only support files saved in a folders for GD
-        # S3 supports folder and all stored in the root
-        # key will be SM0002/twitter__media_ExeUSW2UcAE6RbN.jpg
-        foldername = key.split('/', 1)[0]
-        # eg twitter__media_asdf.jpg
-        filename = key.split('/', 1)[1]
-
-        logger.debug(f'Looking for {foldername} and filename: {filename} on GD')
+        """
+        only support files saved in a folder for GD
+        S3 supports folder and all stored in the root
+        """
+        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
+        filename = key
+        logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
 
         # retry policy on Google Drive
         try_again = True
@@ -42,11 +38,11 @@ class GDStorage(Storage):
         folder_id = None
         while try_again:
             # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
-            results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
-                                                and name = '{foldername}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
 
             for item in items:
@@ -55,11 +51,11 @@ class GDStorage(Storage):
                 try_again = False
 
             if folder_id is None:
-                logger.debug(f'Cant find {foldername=} waiting and trying again {counter=}')
+                logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
                 counter += 1
                 time.sleep(10)
                 if counter > 18:
-                    raise ValueError(f'Cant find {foldername} and retried 18 times pausing 10seconds at a time which is 3 minutes')
+                    raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
 
         # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
         # happens doing thumbnails
@@ -71,12 +67,11 @@ class GDStorage(Storage):
             logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
 
             # get id of the sub folder
-            results = self.service.files().list(q=f"'{folder_id}' in parents \
-                                                and mimeType='application/vnd.google-apps.folder' \
-                                                and name = '{a}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
 
             filename = None
@@ -87,11 +82,11 @@ class GDStorage(Storage):
                 raise ValueError(f'Problem finding sub folder {a}')
 
         # get id of file inside folder (or sub folder)
-        results = self.service.files().list(q=f"'{folder_id}' in parents \
-                                            and name = '{filename}' ",
-                                            spaces='drive',
-                                            fields='files(id, name)'
-                                            ).execute()
+        results = self.service.files().list(
+            q=f"'{folder_id}' in parents and name = '{filename}' ",
+            spaces='drive',
+            fields='files(id, name)'
+        ).execute()
         items = results.get('files', [])
 
         file_id = None
@@ -110,41 +105,36 @@ class GDStorage(Storage):
         return False
 
     def uploadf(self, file, key, **_kwargs):
-        # split on first occurance of /
-        # eg SM0005
-        foldername = key.split('/', 1)[0]
-        # eg twitter__media_asdf.jpg
-        filename = key.split('/', 1)[1]
+        logger.debug(f"before {self.subfolder=}")
+        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
+        filename = key
+        logger.debug(f"after {self.subfolder=}")
 
         # does folder eg SM0005 exist already inside parent of Files auto-archiver
-        results = self.service.files().list(q=f"'{self.root_folder_id}' in parents \
-                                            and mimeType='application/vnd.google-apps.folder' \
-                                            and name = '{foldername}' ",
-                                            spaces='drive',
-                                            fields='files(id, name)'
-                                            ).execute()
+        results = self.service.files().list(
+            q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
+            spaces='drive',
+            fields='files(id, name)'
+        ).execute()
        items = results.get('files', [])
         folder_id_to_upload_to = None
         if len(items) > 1:
-            logger.error(f'Duplicate folder name of {foldername} which should never happen, but continuing anyway')
+            logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
 
         for item in items:
             logger.debug(f"Found existing folder of {item['name']}")
             folder_id_to_upload_to = item['id']
 
         if folder_id_to_upload_to is None:
-            logger.debug(f'Creating new folder {foldername}')
+            logger.debug(f'Creating new folder {self.subfolder}')
             file_metadata = {
-                'name': [foldername],
+                'name': [self.subfolder],
                 'mimeType': 'application/vnd.google-apps.folder',
                 'parents': [self.root_folder_id]
             }
             gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
             folder_id_to_upload_to = gd_file.get('id')
 
-        # check for subfolder nema in file eg youtube_dl_sDE-qZdi8p8/out1.jpg'
-        # happens doing thumbnails
+        # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
 
         # will always return a and a blank b even if there is nothing to split
         # https://stackoverflow.com/a/38149500/26086
         a, _, b = filename.partition('/')
@@ -155,12 +145,11 @@ class GDStorage(Storage):
             logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
 
             # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
-            results = self.service.files().list(q=f"'{folder_id_to_upload_to}' in parents \
-                                                and mimeType='application/vnd.google-apps.folder' \
-                                                and name = '{a}' ",
-                                                spaces='drive',  # ie not appDataFolder or photos
-                                                fields='files(id, name)'
-                                                ).execute()
+            results = self.service.files().list(
+                q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
             items = results.get('files', [])
             sub_folder_id_to_upload_to = None
             if len(items) > 1:
@@ -184,17 +173,13 @@ class GDStorage(Storage):
             folder_id_to_upload_to = sub_folder_id_to_upload_to
             # back to normal control flow
 
-        # else:
         # upload file to gd
         file_metadata = {
-            # 'name': 'twitter__media_FMQg7yeXwAAwNEi.jpg',
             'name': [filename],
             'parents': [folder_id_to_upload_to]
         }
         media = MediaFileUpload(file, resumable=True)
-        gd_file = self.service.files().create(body=file_metadata,
-                                              media_body=media,
-                                              fields='id').execute()
+        gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
 
     def upload(self, filename: str, key: str, **kwargs):
         # GD only requires the filename not a file reader
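For readers unfamiliar with the Drive v3 query strings used above: `files().list` takes a `q` filter combining parent folder, MIME type, and name. A standalone sketch of the folder lookup, assuming an authenticated `service` (built via `googleapiclient.discovery.build`) and a known `root_folder_id`:

```python
# minimal folder lookup against the Google Drive v3 API
results = service.files().list(
    q=f"'{root_folder_id}' in parents "
      f"and mimeType='application/vnd.google-apps.folder' "
      f"and name = 'SM0005'",        # subfolder name being searched for
    spaces='drive',                  # regular Drive space, not appDataFolder
    fields='files(id, name)',        # only fetch the fields we need
).execute()
folder_ids = [f['id'] for f in results.get('files', [])]
```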
@@ -2,6 +2,7 @@ import boto3
 from botocore.errorfactory import ClientError
 from .base_storage import Storage
 from dataclasses import dataclass
+from loguru import logger
 
 
 @dataclass
@@ -19,12 +20,9 @@ class S3Storage(Storage):
     def __init__(self, config: S3Config):
         self.bucket = config.bucket
         self.region = config.region
-        self.folder = config.folder
+        self.folder = self.clean_path(config.folder)
         self.private = config.private
 
-        if len(self.folder) and self.folder[-1] != '/':
-            self.folder += '/'
-
         self.s3 = boto3.client(
             's3',
             region_name=self.region,
@@ -34,7 +32,7 @@ class S3Storage(Storage):
         )
 
     def _get_path(self, key):
-        return self.folder + key
+        return self.folder + self.clean_path(self.subfolder) + key
 
     def get_cdn_url(self, key):
         return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
@@ -47,9 +45,9 @@ class S3Storage(Storage):
         return False
 
     def uploadf(self, file, key, **kwargs):
+        logger.debug(f'[S3 storage] uploading {file=}, {key=}')
         if self.private:
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
 
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
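The practical effect of the `_get_path` change is that both the configured root folder and the per-row subfolder are prepended to every key. Illustrative values, assuming an `S3Storage` instance `s3` (attributes set directly here for brevity; in the real flow they come from the config and from `update_properties`):

```python
s3.folder = s3.clean_path('my-archive')    # -> 'my-archive/'
s3.update_properties(subfolder='SM0005')   # set once per spreadsheet row

s3._get_path('twitter__media_x.jpg')
# -> 'my-archive/SM0005/twitter__media_x.jpg'
# with no subfolder (None/''), clean_path returns '' and the path is
# 'my-archive/twitter__media_x.jpg'
```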
@@ -9,8 +9,8 @@ class GWorksheet:
     eg: if header=4, row 5 will be the first with data.
     """
     COLUMN_NAMES = {
-        'filenumber': 'file number',
         'url': 'link',
+        'subfolder': 'sub folder',
         'archive': 'archive location',
         'date': 'archive date',
         'status': 'archive status',
@@ -69,6 +69,15 @@ class GWorksheet:
             return ''
         return row[col_index]
 
+    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
+        """
+        return self.get_cell or default value on error (eg: column is missing)
+        """
+        try:
+            return self.get_cell(row, col, fresh)
+        except:
+            return default
+
     def set_cell(self, row: int, col: str, val):
         # row is 1-based
         col_index = self._col_index(col) + 1
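Unlike `get_cell`, the new `get_cell_or_default` does not raise when the worksheet lacks the column, which is what lets the subfolder column stay optional. A hypothetical usage sketch, with `gw` a `GWorksheet` over a sheet that has no `sub folder` column:

```python
subfolder = gw.get_cell_or_default(5, 'subfolder')               # -> None, no exception
subfolder = gw.get_cell_or_default(5, 'subfolder', default='')   # -> ''
```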