Fix Selenium driver issues with telegram links

pull/21/head
Logan Williams 2022-03-18 11:10:27 +01:00
rodzic 538bb05395
commit 398f296789
4 zmienionych plików z 15 dodań i 4 usunięć

Wyświetl plik

@@ -8,6 +8,8 @@ from urllib.parse import urlparse
import hashlib import hashlib
import time import time
import requests import requests
from loguru import logger
from selenium.common.exceptions import TimeoutException
from storages import Storage from storages import Storage
from utils import mkdir_if_not_exists from utils import mkdir_if_not_exists
@@ -54,6 +56,7 @@ class Archiver(ABC):
for url_info in urls_info: for url_info in urls_info:
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>''' page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
# TODO/ISSUE: character encoding is incorrect for Cyrillic, produces garbled text
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>" page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>" page += f"</body></html>"
@@ -125,8 +128,11 @@ class Archiver(ABC):
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key filename = 'tmp/' + key
self.driver.get(url) try:
time.sleep(6) self.driver.get(url)
time.sleep(6)
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename) self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={ self.storage.upload(filename, key, extra_args={

Wyświetl plik

@@ -76,7 +76,7 @@ class TelethonArchiver(Archiver):
uploaded_media = [] uploaded_media = []
message = post.message message = post.message
for mp in media_posts: for mp in media_posts:
if len(mp.message) > message: message = mp.message if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1] key = filename.split('tmp/')[1]
self.storage.upload(filename, key) self.storage.upload(filename, key)

Wyświetl plik

@@ -78,8 +78,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.headless = True options.headless = True
driver = webdriver.Firefox(options=options) profile = webdriver.FirefoxProfile()
profile.set_preference('network.protocol-handler.external.tg', False)
driver = webdriver.Firefox(profile, options=options)
driver.set_window_size(1400, 2000) driver.set_window_size(1400, 2000)
driver.set_page_load_timeout(10)
# loop through worksheets to check # loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()): for ii, wks in enumerate(sh.worksheets()):

Wyświetl plik

@@ -20,6 +20,7 @@ class S3Storage(Storage):
self.bucket = config.bucket self.bucket = config.bucket
self.region = config.region self.region = config.region
self.folder = config.folder self.folder = config.folder
self.private = config.private
if len(self.folder) and self.folder[-1] != '/': if len(self.folder) and self.folder[-1] != '/':
self.folder += '/' self.folder += '/'