Mirror of https://github.com/bellingcat/auto-archiver
merge from master and fixes
parent f6e8da34b8
commit bca960b228
@@ -17,7 +17,7 @@ class YoutubeDLArchiver(Archiver):
     def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
 
-        if netloc in ['facebook.com', 'www.facebook.com']:
+        if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.fb_cookie
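Note: the new "and self.fb_cookie" guard makes the cookie injection conditional on a cookie actually being configured, instead of running for every Facebook URL. A minimal standalone sketch of the same yt-dlp pattern, assuming a hypothetical FACEBOOK_COOKIE environment variable holding the raw cookie string:

import os
import yt_dlp

fb_cookie = os.getenv('FACEBOOK_COOKIE')  # hypothetical variable name
if fb_cookie:
    # std_headers are sent with every yt-dlp request, so setting the cookie
    # once authenticates all subsequent Facebook downloads in this process
    yt_dlp.utils.std_headers['cookie'] = fb_cookie

with yt_dlp.YoutubeDL({'outtmpl': 'tmp/%(id)s.%(ext)s'}) as ydl:
    # placeholder URL; any Facebook video URL works here
    info = ydl.extract_info('https://www.facebook.com/watch/?v=123', download=False)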
@@ -1,18 +1,14 @@
-# import os
 import sys
 import datetime
-# import argparse
 import shutil
-# import gspread
 from loguru import logger
 from dotenv import load_dotenv
 
 import traceback
 
-# import archivers
 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
 from configs import Config
-import sys
 
 logger.add("logs/1trace.log", level="TRACE")
 logger.add("logs/2info.log", level="INFO")
@@ -79,16 +75,6 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
         c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/')
         storage = c.get_storage()
 
-        # order matters, first to succeed excludes remaining
-        active_archivers = [
-            TelethonArchiver(storage, c.webdriver, c.telegram_config),
-            TelegramArchiver(storage, c.webdriver),
-            TiktokArchiver(storage, c.webdriver),
-            YoutubeDLArchiver(storage, c.webdriver),
-            TwitterArchiver(storage, c.webdriver),
-            WaybackArchiver(storage, c.webdriver)
-            archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
-        ]
-
         # loop through rows in worksheet
         for row in range(1 + header, gw.count_rows() + 1):
@@ -99,17 +85,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
             gw.set_cell(row, 'status', 'Archive in progress')
 
             url = expand_url(url)
 
             # make a new driver so each spreadsheet row is idempotent
-            options = webdriver.FirefoxOptions()
-            options.headless = True
-            options.set_preference('network.protocol-handler.external.tg', False)
+            c.recreate_webdriver()
+
+            # order matters, first to succeed excludes remaining
+            active_archivers = [
+                TelethonArchiver(storage, c.webdriver, c.telegram_config),
+                TelegramArchiver(storage, c.webdriver),
+                TiktokArchiver(storage, c.webdriver),
+                YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
+                TwitterArchiver(storage, c.webdriver),
+                WaybackArchiver(storage, c.webdriver)
+            ]
 
-            driver = webdriver.Firefox(options=options)
-            driver.set_window_size(1400, 2000)
-            # in seconds, telegram screenshots catch which don't come back
-            driver.set_page_load_timeout(120)
             for archiver in active_archivers:
                 logger.debug(f'Trying {archiver} on row {row}')
@@ -121,23 +109,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
                 if result:
                     if result.status in ['success', 'already archived']:
-                        result.status = archiver.name + \
-                            ": " + str(result.status)
-                        logger.success(
-                            f'{archiver} succeeded on row {row}')
+                        result.status = f"{archiver.name}: {result.status}"
+                        logger.success(f'{archiver} succeeded on row {row}')
                         break
-                    logger.warning(
-                        f'{archiver} did not succeed on row {row}, final status: {result.status}')
-                    result.status = archiver.name + \
-                        ": " + str(result.status)
-            # get rid of driver so can reload on next row
-            driver.quit()
+                    logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
+                    result.status = f"{archiver.name}: {result.status}"
 
             if result:
                 update_sheet(gw, row, result)
             else:
                 gw.set_cell(row, 'status', 'failed: no archiver')
     logger.success(f'Finished worksheet {wks.title}')
 
 
 @logger.catch
 def main():
     logger.debug(f'Passed args:{sys.argv}')
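Note: the per-row list above preserves the "order matters, first to succeed excludes remaining" contract. A condensed sketch of that control flow, with construction and error handling elided:

for archiver in active_archivers:
    result = archiver.download(url, check_if_exists=True)  # signature per the first hunk
    if result and result.status in ['success', 'already archived']:
        break  # first success excludes the remaining archivers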
@@ -61,15 +61,20 @@ class Config:
         # selenium driver
         selenium_configs = execution.get("selenium", {})
         self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10))
-        options = webdriver.FirefoxOptions()
-        options.headless = True
-        options.set_preference('network.protocol-handler.external.tg', False)
-        self.webdriver = webdriver.Firefox(options=options)
-        self.webdriver.set_window_size(1400, 2000)
-        self.webdriver.set_page_load_timeout(self.selenium_timeout)
+        self.webdriver = "not initialized"
 
-        secrets = self.config.get("secrets", {})
+        # APIs and service configurations
+        secrets = self.config.get("secrets", {})
+
+        # google sheets config
+        self.gsheets_client = gspread.service_account(
+            filename=secrets.get("google_api", {}).get("filename", 'service_account.json')
+        )
+
+        # facebook config
+        self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)
 
         # s3 config
         if "s3" in secrets:
             s3 = secrets["s3"]
             self.s3_config = S3Config(
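Note: the chained .get(..., {}) lookups make every secrets block optional, so a missing section degrades to None rather than raising KeyError. A tiny self-contained illustration:

secrets = {'facebook': {'cookie': 'datr=abc'}}
print(secrets.get('facebook', {}).get('cookie', None))  # 'datr=abc'
print(secrets.get('wayback', {}).get('key', None))      # None: absent block, no crash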
@@ -86,6 +91,7 @@ class Config:
         else:
             logger.debug(f"'s3' key not present in the {self.config_file=}")
 
         # wayback machine config
         if "wayback" in secrets:
             self.wayback_config = WaybackConfig(
                 key=secrets["wayback"]["key"],
@@ -94,6 +100,7 @@ class Config:
         else:
             logger.debug(f"'wayback' key not present in the {self.config_file=}")
 
         # telethon config
         if "telegram" in secrets:
             self.telegram_config = TelegramConfig(
                 api_id=secrets["telegram"]["api_id"],
@@ -102,10 +109,6 @@ class Config:
         else:
             logger.debug(f"'telegram' key not present in the {self.config_file=}")
 
-        self.gsheets_client = gspread.service_account(
-            filename=secrets.get("google_api", {}).get("filename", 'service_account.json')
-        )
-
         del self.config["secrets"]
 
     def get_argument_parser(self):
@@ -133,6 +136,17 @@ class Config:
             return LocalStorage(self.folder)
         raise NotImplementedError(f"storage {self.storage} not yet implemented")
 
+    def destroy_webdriver(self):
+        if self.webdriver is not None:
+            self.webdriver.quit()
+
+    def recreate_webdriver(self):
+        options = webdriver.FirefoxOptions()
+        options.headless = True
+        options.set_preference('network.protocol-handler.external.tg', False)
+        self.webdriver = webdriver.Firefox(options=options)
+        self.webdriver.set_window_size(1400, 2000)
+        self.webdriver.set_page_load_timeout(self.selenium_timeout)
+
     def __str__(self) -> str:
         return json.dumps({
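Note: with the driver now created lazily, constructing Config no longer launches Firefox. A usage sketch of the intended lifecycle, assuming c is an already-loaded Config instance:

c.recreate_webdriver()                   # fresh headless Firefox for this row
c.webdriver.get('https://example.com')   # plain Selenium calls from here on
png = c.webdriver.get_screenshot_as_png()
c.destroy_webdriver()                    # quit before the process exits

One caveat: destroy_webdriver only checks "is not None", but webdriver starts out as a placeholder string, on which quit() would raise AttributeError; initializing to None (or checking for the placeholder) would be safer.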
@@ -0,0 +1,51 @@
+import os
+import datetime
+import argparse
+import requests
+import shutil
+import gspread
+from loguru import logger
+from dotenv import load_dotenv
+from selenium import webdriver
+import traceback
+
+import archivers
+from storages import S3Storage, S3Config
+from utils import GWorksheet, mkdir_if_not_exists
+
+load_dotenv()
+
+options = webdriver.FirefoxOptions()
+options.headless = True
+driver = webdriver.Firefox(options=options)
+driver.set_window_size(1400, 2000)
+
+s3_config = S3Config(
+    bucket=os.getenv('DO_BUCKET'),
+    region=os.getenv('DO_SPACES_REGION'),
+    key=os.getenv('DO_SPACES_KEY'),
+    secret=os.getenv('DO_SPACES_SECRET'),
+    folder="temp"
+)
+s3_client = S3Storage(s3_config)
+telegram_config = archivers.TelegramConfig(
+    api_id=os.getenv('TELEGRAM_API_ID'),
+    api_hash=os.getenv('TELEGRAM_API_HASH')
+)
+
+archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config)
+
+URLs = [
+    # "https://t.me/c/1226032830/24864",
+    # "https://t.me/truexanewsua/32650",
+    "https://t.me/informatsia_obstanovka/5239",
+    # "https://t.me/informatsia_obstanovka/5240",
+    # "https://t.me/informatsia_obstanovka/5241",
+    # "https://t.me/informatsia_obstanovka/5242"
+]
+
+for url in URLs:
+    print(url)
+    print(archiver.download(url, False))
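Note: the new file is a manual smoke test for TelethonArchiver and reads its credentials from .env via load_dotenv(). A quick preflight sketch for the variables it expects (names taken from the os.getenv calls above):

import os

required = ['DO_BUCKET', 'DO_SPACES_REGION', 'DO_SPACES_KEY',
            'DO_SPACES_SECRET', 'TELEGRAM_API_ID', 'TELEGRAM_API_HASH']
missing = [v for v in required if not os.getenv(v)]
if missing:
    raise SystemExit(f'missing in .env: {missing}')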