kopia lustrzana https://github.com/bellingcat/auto-archiver
Fix Selenium driver issues with Telegram links
rodzic
538bb05395
commit
398f296789
|
@ -8,6 +8,8 @@ from urllib.parse import urlparse
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
import requests
|
import requests
|
||||||
|
from loguru import logger
|
||||||
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
|
||||||
from storages import Storage
|
from storages import Storage
|
||||||
from utils import mkdir_if_not_exists
|
from utils import mkdir_if_not_exists
|
||||||
|
@ -54,6 +56,7 @@ class Archiver(ABC):
|
||||||
for url_info in urls_info:
|
for url_info in urls_info:
|
||||||
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
|
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
|
||||||
|
|
||||||
|
# TODO/ISSUE: character encoding is incorrect for Cyrillic, produces garbled text
|
||||||
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
|
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
|
||||||
page += f"</body></html>"
|
page += f"</body></html>"
|
||||||
|
|
||||||
|
@ -125,8 +128,11 @@ class Archiver(ABC):
|
||||||
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
||||||
filename = 'tmp/' + key
|
filename = 'tmp/' + key
|
||||||
|
|
||||||
self.driver.get(url)
|
try:
|
||||||
time.sleep(6)
|
self.driver.get(url)
|
||||||
|
time.sleep(6)
|
||||||
|
except TimeoutException:
|
||||||
|
logger.info("TimeoutException loading page for screenshot")
|
||||||
|
|
||||||
self.driver.save_screenshot(filename)
|
self.driver.save_screenshot(filename)
|
||||||
self.storage.upload(filename, key, extra_args={
|
self.storage.upload(filename, key, extra_args={
|
||||||
|
|
|
@ -76,7 +76,7 @@ class TelethonArchiver(Archiver):
|
||||||
uploaded_media = []
|
uploaded_media = []
|
||||||
message = post.message
|
message = post.message
|
||||||
for mp in media_posts:
|
for mp in media_posts:
|
||||||
if len(mp.message) > message: message = mp.message
|
if len(mp.message) > len(message): message = mp.message
|
||||||
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
|
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
|
||||||
key = filename.split('tmp/')[1]
|
key = filename.split('tmp/')[1]
|
||||||
self.storage.upload(filename, key)
|
self.storage.upload(filename, key)
|
||||||
|
|
|
@ -78,8 +78,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
|
||||||
|
|
||||||
options = webdriver.FirefoxOptions()
|
options = webdriver.FirefoxOptions()
|
||||||
options.headless = True
|
options.headless = True
|
||||||
driver = webdriver.Firefox(options=options)
|
profile = webdriver.FirefoxProfile()
|
||||||
|
profile.set_preference('network.protocol-handler.external.tg', False)
|
||||||
|
|
||||||
|
driver = webdriver.Firefox(profile, options=options)
|
||||||
driver.set_window_size(1400, 2000)
|
driver.set_window_size(1400, 2000)
|
||||||
|
driver.set_page_load_timeout(10)
|
||||||
|
|
||||||
# loop through worksheets to check
|
# loop through worksheets to check
|
||||||
for ii, wks in enumerate(sh.worksheets()):
|
for ii, wks in enumerate(sh.worksheets()):
|
||||||
|
|
|
@ -20,6 +20,7 @@ class S3Storage(Storage):
|
||||||
self.bucket = config.bucket
|
self.bucket = config.bucket
|
||||||
self.region = config.region
|
self.region = config.region
|
||||||
self.folder = config.folder
|
self.folder = config.folder
|
||||||
|
self.private = config.private
|
||||||
|
|
||||||
if len(self.folder) and self.folder[-1] != '/':
|
if len(self.folder) and self.folder[-1] != '/':
|
||||||
self.folder += '/'
|
self.folder += '/'
|
||||||
|
|
Ładowanie…
Reference in New Issue