From 398f296789ad4c38c39ab9473e925c72a40fe718 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 18 Mar 2022 11:10:27 +0100 Subject: [PATCH] Fix Selenium driver issues with telegram links --- archivers/base_archiver.py | 10 ++++++++-- archivers/telethon_archiver.py | 2 +- auto_archive.py | 6 +++++- storages/s3_storage.py | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 687e762..b32bee0 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -8,6 +8,8 @@ from urllib.parse import urlparse import hashlib import time import requests +from loguru import logger +from selenium.common.exceptions import TimeoutException from storages import Storage from utils import mkdir_if_not_exists @@ -54,6 +56,7 @@ class Archiver(ABC): for url_info in urls_info: page += f'''
  • {url_info['key']}: {url_info['hash']}
  • ''' + # TODO/ISSUE: character encoding is incorrect for Cyrillic, produces garbled text page += f"

    {self.name} object data:

    {object}" page += f"" @@ -125,8 +128,11 @@ class Archiver(ABC): "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") filename = 'tmp/' + key - self.driver.get(url) - time.sleep(6) + try: + self.driver.get(url) + time.sleep(6) + except TimeoutException: + logger.info("TimeoutException loading page for screenshot") self.driver.save_screenshot(filename) self.storage.upload(filename, key, extra_args={ diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index c332854..f7ee53e 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -76,7 +76,7 @@ class TelethonArchiver(Archiver): uploaded_media = [] message = post.message for mp in media_posts: - if len(mp.message) > message: message = mp.message + if len(mp.message) > len(message): message = mp.message filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') key = filename.split('tmp/')[1] self.storage.upload(filename, key) diff --git a/auto_archive.py b/auto_archive.py index 1f00b2f..fea9bfb 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -78,8 +78,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): options = webdriver.FirefoxOptions() options.headless = True - driver = webdriver.Firefox(options=options) + profile = webdriver.FirefoxProfile() + profile.set_preference('network.protocol-handler.external.tg', False) + + driver = webdriver.Firefox(profile, options=options) driver.set_window_size(1400, 2000) + driver.set_page_load_timeout(10) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 53bb151..d7c9644 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -20,6 +20,7 @@ class S3Storage(Storage): self.bucket = config.bucket self.region = config.region self.folder = config.folder + self.private = config.private if len(self.folder) and self.folder[-1] != '/': self.folder += '/'