Mirror of https://github.com/bellingcat/auto-archiver
Merge remote-tracking branch 'origin/main' into refactor-configs
commit f6e8da34b8
@@ -5,4 +5,6 @@ DO_BUCKET=
 INTERNET_ARCHIVE_S3_KEY=
 INTERNET_ARCHIVE_S3_SECRET=
 TELEGRAM_API_ID=
 TELEGRAM_API_HASH=
+
+FACEBOOK_COOKIE=cookie: datr= xxxx
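For context, a minimal sketch of how this new FACEBOOK_COOKIE entry gets consumed — load_dotenv() and os.getenv('FACEBOOK_COOKIE') both appear in the auto_archive.py hunks below; the fallback warning is illustrative only:

    # sketch: .env is read once at startup, then the cookie is pulled from the
    # environment and handed to the archiver (see the hunks further down)
    import os
    from dotenv import load_dotenv

    load_dotenv()                          # loads .env into os.environ
    cookie = os.getenv('FACEBOOK_COOKIE')  # e.g. 'cookie: datr= xxxx'
    if not cookie:
        print('FACEBOOK_COOKIE not set; Facebook downloads may fail')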
@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
             elif type(media) == Gif:
                 urls.append(media.variants[0].url)
             elif type(media) == Photo:
-                urls.append(media.fullUrl)
+                urls.append(media.fullUrl.replace('name=large', 'name=orig'))
             else:
                 logger.warning(f"Could not get media URL of {media}")

@@ -11,11 +11,15 @@ class YoutubeDLArchiver(Archiver):
     name = "youtube_dl"
     ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

+    def __init__(self, storage: Storage, driver, fb_cookie):
+        super().__init__(storage, driver)
+        self.fb_cookie = fb_cookie
+
     def download(self, url, check_if_exists=False):
         netloc = self.get_netloc(url)
-        if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'):
-            logger.info('Using Facebook cookie')
-            yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
+        if netloc in ['facebook.com', 'www.facebook.com']:
+            logger.debug('Using Facebook cookie')
+            yt_dlp.utils.std_headers['cookie'] = self.fb_cookie

         ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
         cdn_url = None
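A self-contained sketch of the constructor change above: the cookie is injected once at construction time rather than read from os.getenv('FB_COOKIE') inside download(). Storage and Archiver here are hypothetical stubs standing in for the project's classes:

    import os

    class Storage:                      # stub for the project's Storage class
        TMP_FOLDER = 'tmp/'

    class Archiver:                     # stub base class
        def __init__(self, storage, driver):
            self.storage = storage
            self.driver = driver

    class YoutubeDLArchiver(Archiver):
        def __init__(self, storage: Storage, driver, fb_cookie):
            super().__init__(storage, driver)
            self.fb_cookie = fb_cookie  # injected, no env lookup in download()

    # wiring, mirroring the archivers list in the hunks below:
    archiver = YoutubeDLArchiver(Storage(), driver=None,
                                 fb_cookie=os.getenv('FACEBOOK_COOKIE'))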
@@ -30,6 +34,13 @@ class YoutubeDLArchiver(Archiver):
         if info.get('is_live', False):
             logger.warning("Live streaming media, not archiving now")
             return ArchiveResult(status="Streaming media")
+
+        if 'twitter.com' in netloc:
+            if 'https://twitter.com/' in info['webpage_url']:
+                logger.info('Found https://twitter.com/ in the download url from Twitter')
+            else:
+                logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
+                return False

         if check_if_exists:
             if 'entries' in info:
@@ -12,6 +12,13 @@ import traceback
 from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
 from configs import Config
+import sys
+
+logger.add("logs/1trace.log", level="TRACE")
+logger.add("logs/2info.log", level="INFO")
+logger.add("logs/3success.log", level="SUCCESS")
+logger.add("logs/4warning.log", level="WARNING")
+logger.add("logs/5error.log", level="ERROR")

 load_dotenv()

@@ -55,8 +62,8 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):

     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
-        logger.info(f'Opening worksheet {ii}: "{wks.title}" header={c.header}')
-        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
+        logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
+        gw = GWorksheet(wks, header_row=header, columns=columns)

         if not gw.col_exists('url'):
             logger.warning(
@@ -80,6 +87,7 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
         YoutubeDLArchiver(storage, c.webdriver),
         TwitterArchiver(storage, c.webdriver),
         WaybackArchiver(storage, c.webdriver)
+        archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')),
     ]

     # loop through rows in worksheet
@ -91,7 +99,17 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
|
||||||
gw.set_cell(row, 'status', 'Archive in progress')
|
gw.set_cell(row, 'status', 'Archive in progress')
|
||||||
|
|
||||||
url = expand_url(url)
|
url = expand_url(url)
|
||||||
|
|
||||||
|
|
||||||
|
# make a new driver so each spreadsheet row is idempotent
|
||||||
|
options = webdriver.FirefoxOptions()
|
||||||
|
options.headless = True
|
||||||
|
options.set_preference('network.protocol-handler.external.tg', False)
|
||||||
|
|
||||||
|
driver = webdriver.Firefox(options=options)
|
||||||
|
driver.set_window_size(1400, 2000)
|
||||||
|
# in seconds, telegram screenshots catch which don't come back
|
||||||
|
driver.set_page_load_timeout(120)
|
||||||
for archiver in active_archivers:
|
for archiver in active_archivers:
|
||||||
logger.debug(f'Trying {archiver} on row {row}')
|
logger.debug(f'Trying {archiver} on row {row}')
|
||||||
|
|
||||||
|
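The driver block above assumes selenium's webdriver import; a standalone version under that assumption (geckodriver must be on PATH; options.headless is the pre-Selenium-4.10 property setter):

    from selenium import webdriver

    options = webdriver.FirefoxOptions()
    options.headless = True
    options.set_preference('network.protocol-handler.external.tg', False)

    driver = webdriver.Firefox(options=options)
    driver.set_window_size(1400, 2000)
    driver.set_page_load_timeout(120)   # seconds
    driver.quit()                       # fresh driver per row, quit when done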
@@ -112,15 +130,17 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
                         f'{archiver} did not succeed on row {row}, final status: {result.status}')
                     result.status = archiver.name + \
                         ": " + str(result.status)
+            # get rid of driver so can reload on next row
+            driver.quit()
             if result:
                 update_sheet(gw, row, result)
             else:
                 gw.set_cell(row, 'status', 'failed: no archiver')
         logger.success(f'Finshed worksheet {wks.title}')


+@logger.catch
 def main():
+    logger.debug(f'Passed args:{sys.argv}')
     c = Config()
     c.parse()