merge from master and fixes

pull/33/head
msramalho 2022-05-10 23:09:33 +02:00
parent f6e8da34b8
commit bca960b228
4 changed files with 95 additions and 46 deletions

View file

@@ -17,7 +17,7 @@ class YoutubeDLArchiver(Archiver):
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'www.facebook.com']:
if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.fb_cookie
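The guard above means yt-dlp's global headers are only touched when a Facebook cookie has actually been configured. A minimal sketch of the pattern, using a hypothetical class name and constructor (the real signature lives in the archiver module, not this hunk):
from urllib.parse import urlparse
import yt_dlp

class CookieAwareArchiver:
    def __init__(self, storage, driver, fb_cookie=None):
        # fb_cookie stays None when no Facebook cookie is configured
        self.fb_cookie = fb_cookie

    def download(self, url, check_if_exists=False):
        netloc = urlparse(url).netloc
        if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
            # only inject the cookie header when a cookie is present
            yt_dlp.utils.std_headers['cookie'] = self.fb_cookie
        # ...hand the URL off to yt-dlp as before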

View file

@@ -1,18 +1,14 @@
# import os
import sys
import datetime
# import argparse
import shutil
# import gspread
from loguru import logger
from dotenv import load_dotenv
import traceback
# import archivers
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
import sys
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
@@ -79,16 +75,6 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
storage = c.get_storage()
# order matters, first to succeed excludes remaining
active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TelegramArchiver(storage, c.webdriver),
TiktokArchiver(storage, c.webdriver),
YoutubeDLArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
WaybackArchiver(storage, c.webdriver)
archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
]
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
@@ -99,17 +85,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
# make a new driver so each spreadsheet row is idempotent
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
c.recreate_webdriver()
# order matters, first to succeed excludes remaining
active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TelegramArchiver(storage, c.webdriver),
TiktokArchiver(storage, c.webdriver),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TwitterArchiver(storage, c.webdriver),
WaybackArchiver(storage, c.webdriver)
]
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
# in seconds, to catch telegram screenshots which don't come back
driver.set_page_load_timeout(120)
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
@@ -121,23 +109,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
if result:
if result.status in ['success', 'already archived']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}')
result.status = f"{archiver.name}: {result.status}"
logger.success(f'{archiver} succeeded on row {row}')
break
logger.warning(
f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
driver.quit()
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = f"{archiver.name}: {result.status}"
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
logger.success(f'Finished worksheet {wks.title}')
@logger.catch
def main():
logger.debug(f'Passed args:{sys.argv}')
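The rewritten loop above builds a fresh Firefox driver for every spreadsheet row and then tries each archiver in turn, stopping at the first success. A condensed sketch of that per-row flow (archive_row is a hypothetical helper name; Config and the archiver classes are the project's own):
from archivers import TelethonArchiver, YoutubeDLArchiver, WaybackArchiver

def archive_row(c, storage, url):
    # fresh browser per row keeps each spreadsheet row idempotent
    c.recreate_webdriver()
    # order matters, first to succeed excludes remaining
    active_archivers = [
        TelethonArchiver(storage, c.webdriver, c.telegram_config),
        YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
        WaybackArchiver(storage, c.webdriver),
    ]
    result = None
    for archiver in active_archivers:
        result = archiver.download(url, check_if_exists=True)
        if result:
            if result.status in ['success', 'already archived']:
                result.status = f"{archiver.name}: {result.status}"
                break
            result.status = f"{archiver.name}: {result.status}"
    return result  # None means no archiver produced anything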

View file

@@ -61,15 +61,20 @@ class Config:
# selenium driver
selenium_configs = execution.get("selenium", {})
self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10))
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
self.webdriver = webdriver.Firefox(options=options)
self.webdriver.set_window_size(1400, 2000)
self.webdriver.set_page_load_timeout(self.selenium_timeout)
self.webdriver = "not initalized"
secrets = self.config.get("secrets", {})
# APIs and service configurations
secrets = self.config.get("secrets", {})
# google sheets config
self.gsheets_client = gspread.service_account(
filename=secrets.get("google_api", {}).get("filename", 'service_account.json')
)
# facebook config
self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)
# s3 config
if "s3" in secrets:
s3 = secrets["s3"]
self.s3_config = S3Config(
@@ -86,6 +91,7 @@ class Config:
else:
logger.debug(f"'s3' key not present in the {self.config_file=}")
# wayback machine config
if "wayback" in secrets:
self.wayback_config = WaybackConfig(
key=secrets["wayback"]["key"],
@@ -94,6 +100,7 @@ class Config:
else:
logger.debug(f"'wayback' key not present in the {self.config_file=}")
# telethon config
if "telegram" in secrets:
self.telegram_config = TelegramConfig(
api_id=secrets["telegram"]["api_id"],
@@ -102,10 +109,6 @@ class Config:
else:
logger.debug(f"'telegram' key not present in the {self.config_file=}")
self.gsheets_client = gspread.service_account(
filename=secrets.get("google_api", {}).get("filename", 'service_account.json')
)
del self.config["secrets"]
def get_argument_parser(self):
@@ -133,6 +136,17 @@ class Config:
return LocalStorage(self.folder)
raise f"storage {self.storage} not yet implemented"
def destroy_webdriver(self):
if self.webdriver is not None:
self.webdriver.quit()
def recreate_webdriver(self):
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
self.webdriver = webdriver.Firefox(options=options)
self.webdriver.set_window_size(1400, 2000)
self.webdriver.set_page_load_timeout(self.selenium_timeout)
def __str__(self) -> str:
return json.dumps({
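With this change Config no longer starts Firefox in its constructor: self.webdriver begins as a placeholder string, and the two new methods let the caller build and tear down the browser on demand. A short sketch of the intended lifecycle (the no-argument Config() call is an assumption, since construction is not shown in this hunk, and the URL is only an example):
c = Config()              # construction/argument parsing not shown in this hunk
c.recreate_webdriver()    # builds a headless Firefox with the configured timeout
try:
    c.webdriver.get("https://example.com")
finally:
    c.destroy_webdriver()  # quit the browser so the next run starts clean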

test.py 100644 (new file, 51 lines added)
View file

@@ -0,0 +1,51 @@
import os
import datetime
import argparse
import requests
import shutil
import gspread
from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import traceback
import archivers
from storages import S3Storage, S3Config
from utils import GWorksheet, mkdir_if_not_exists
load_dotenv()
options = webdriver.FirefoxOptions()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET'),
folder="temp"
)
s3_client = S3Storage(s3_config)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config)
URLs = [
# "https://t.me/c/1226032830/24864",
# "https://t.me/truexanewsua/32650",
"https://t.me/informatsia_obstanovka/5239",
# "https://t.me/informatsia_obstanovka/5240",
# "https://t.me/informatsia_obstanovka/5241",
# "https://t.me/informatsia_obstanovka/5242"
]
for url in URLs:
print(url)
print(archiver.download(url, False))