Mirror of https://github.com/bellingcat/auto-archiver

Merge pull request #64 from bellingcat/dev

commit 683f2d7500
@@ -19,4 +19,6 @@ local_archive/
vk_config*.json
gd-token.json
credentials.json
secrets/*
browsertrix/*
browsertrix-tmp/*
@@ -18,6 +18,8 @@ You also need:
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) in a folder on your PATH, such as `/usr/local/bin`.
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
5. Internet Archive credentials, which can be retrieved from https://archive.org/account/s3.php.
6. If you would like to take archival WACZ snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) in addition to screenshots, you will need to install [Docker](https://www.docker.com/) (a quick check is sketched below).
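A minimal pre-flight check for step 6 (an illustrative sketch, not part of the repository), assuming Docker is already installed and running:

```python
import shutil, subprocess

# Illustrative check: Docker must be on the PATH and the
# webrecorder/browsertrix-crawler image must be pullable for WACZ snapshots to work.
if shutil.which("docker") is None:
    raise SystemExit("Docker not found on PATH - required for WACZ snapshots")
subprocess.run(["docker", "pull", "webrecorder/browsertrix-crawler"], check=True)
```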
### Configuration file

Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overridden via command line arguments. Here is the current result of running `python auto_archive.py --help`:
@@ -1,4 +1,4 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse

@@ -10,6 +10,7 @@ from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify

from configs import Config
from storages import Storage
from utils import mkdir_if_not_exists
@@ -24,16 +25,18 @@ class ArchiveResult:
title: str = None
timestamp: datetime.datetime = None
screenshot: str = None
wacz: str = None
hash: str = None

class Archiver(ABC):
HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
name = "default"
retry_regex = r"retrying at (\d+)$"

def __init__(self, storage: Storage, driver):
def __init__(self, storage: Storage, config: Config):
self.storage = storage
self.driver = driver
self.driver = config.webdriver
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config

def __str__(self):
return self.__class__.__name__
@@ -162,11 +165,11 @@ class Archiver(ABC):
def get_hash(self, filename):
with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes
logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
logger.debug(f'Hash algorithm is {self.hash_algorithm}')

if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")

return hash.hexdigest()
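For reference, a standalone sketch (not repository code) of the two digests the `hash_algorithm` setting switches between:

```python
import hashlib

data = b"example archived file contents"  # stand-in for the file bytes read above
print(hashlib.sha256(data).hexdigest())    # hash_algorithm: "SHA-256"
print(hashlib.sha3_512(data).hexdigest())  # hash_algorithm: "SHA3-512"
```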
@@ -195,8 +198,54 @@ class Archiver(ABC):
logger.info("TimeoutException loading page for screenshot")

self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

return self.storage.get_cdn_url(key)

def get_wacz(self, url):
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))

browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
"-v", f"{browsertrix_home}:/crawls/",
"-it",
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.browsertrix.timeout_seconds)
]

if not os.path.isdir(browsertrix_home):
os.mkdir(browsertrix_home)

if self.browsertrix.profile:
shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
cmd.extend(["--profile", "/crawls/profile.tar.gz"])

try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return

filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")

self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
'ACL': 'public-read', 'ContentType': 'application/zip'})

# clean up the local browsertrix files
try:
shutil.rmtree(browsertrix_home)
except PermissionError:
logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")

return self.storage.get_cdn_url(key)
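To make the naming logic above concrete, here is a hedged illustration of how a storage key turns into a crawl collection name and a local output path; the key value is invented for the example:

```python
import os, re

key = "example-site-2022-07-12T10:30:00.wacz"  # hypothetical key from _get_key_from_url
collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
wacz_path = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")

print(collection)  # examplesite20220712T103000
print(wacz_path)   # <cwd>/browsertrix-tmp/collections/examplesite20220712T103000/examplesite20220712T103000.wacz
```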
@@ -28,6 +28,7 @@ class TelegramArchiver(Archiver):
url += "?embed=1"

screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')

@@ -46,7 +47,7 @@ class TelegramArchiver(Archiver):
time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None

return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)

video_url = video.get('src')
video_id = video_url.split('/')[-1].split('?')[0]

@@ -85,4 +86,4 @@ class TelegramArchiver(Archiver):

cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
@@ -7,7 +7,7 @@ from telethon.errors import ChannelInvalidError

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import TelethonConfig
from configs import Config
from utils import getattr_or

@@ -15,11 +15,12 @@ class TelethonArchiver(Archiver):
name = "telethon"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")

def __init__(self, storage: Storage, driver, config: TelethonConfig):
super().__init__(storage, driver)
if config:
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
if config.telegram_config:
c = config.telegram_config
self.client = TelegramClient("./anon", c.api_id, c.api_hash)
self.bot_token = c.bot_token

def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
"""

@@ -73,6 +74,7 @@ class TelethonArchiver(Archiver):
logger.debug(f'got {len(media_posts)=} for {url=}')

screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

if len(media_posts) > 0:
key = self.get_html_key(url)

@@ -80,7 +82,7 @@ class TelethonArchiver(Archiver):
if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists as not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)

key_thumb, thumb_index = None, None
group_id = post.grouped_id if post.grouped_id is not None else post.id

@@ -119,7 +121,7 @@ class TelethonArchiver(Archiver):

page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))

return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)

page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
@@ -48,6 +48,7 @@ class TiktokArchiver(Archiver):

hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

try: os.remove(filename)
except FileNotFoundError:

@@ -57,7 +58,7 @@ class TiktokArchiver(Archiver):

return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
timestamp=timestamp, hash=hash, screenshot=screenshot)
timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)

except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
@@ -5,7 +5,7 @@ from loguru import logger
from pytwitter import Api

from storages.base_storage import Storage
from configs import TwitterApiConfig
from configs import Config
from .base_archiver import ArchiveResult
from .twitter_archiver import TwitterArchiver

@@ -13,14 +13,15 @@ from .twitter_archiver import TwitterArchiver
class TwitterApiArchiver(TwitterArchiver):
name = "twitter_api"

def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
super().__init__(storage, driver)
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
c = config.twitter_config

if config.bearer_token:
self.api = Api(bearer_token=config.bearer_token)
elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
if c.bearer_token:
self.api = Api(bearer_token=c.bearer_token)
elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
self.api = Api(
consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)
consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret)

def download(self, url, check_if_exists=False):
if not hasattr(self, "api"):

@@ -69,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver):
}, ensure_ascii=False, indent=4)

screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
@@ -39,8 +39,9 @@ class TwitterArchiver(Archiver):
if tweet.media is None:
logger.debug(f'No media found, archiving tweet text only')
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)

urls = []

@@ -59,8 +60,9 @@ class TwitterArchiver(Archiver):
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())

screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)

def download_alternative(self, url, tweet_id):
# https://stackoverflow.com/a/71867055/6196010

@@ -83,8 +85,9 @@ class TwitterArchiver(Archiver):

timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)

def choose_variant(self, variants):
# choosing the highest quality possible
@@ -5,7 +5,7 @@ from vk_url_scraper import VkScraper, DateTimeEncoder

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig
from configs import Config

class VkArchiver(Archiver):

@@ -17,10 +17,10 @@ class VkArchiver(Archiver):
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")

def __init__(self, storage: Storage, driver, config: VkConfig):
super().__init__(storage, driver)
if config != None:
self.vks = VkScraper(config.username, config.password)
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
if config.vk_config != None:
self.vks = VkScraper(config.vk_config.username, config.vk_config.password)

def download(self, url, check_if_exists=False):
if not hasattr(self, "vks") or self.vks is None:

@@ -70,4 +70,5 @@ class VkArchiver(Archiver):
page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
wacz = self.get_wacz(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig
from configs import Config

class WaybackArchiver(Archiver):

@@ -15,9 +15,9 @@ class WaybackArchiver(Archiver):
"""
name = "wayback"

def __init__(self, storage: Storage, driver, config: WaybackConfig):
super(WaybackArchiver, self).__init__(storage, driver)
self.config = config
def __init__(self, storage: Storage, config: Config):
super(WaybackArchiver, self).__init__(storage, config)
self.config = config.wayback_config
self.seen_urls = {}

def download(self, url, check_if_exists=False):

@@ -28,6 +28,8 @@ class WaybackArchiver(Archiver):
if url in self.seen_urls: return self.seen_urls[url]

screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

logger.debug(f"POSTing {url=} to web.archive.org")
ia_headers = {
"Accept": "application/json",

@@ -37,10 +39,10 @@ class WaybackArchiver(Archiver):

if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)

if 'job_id' not in r.json() and 'message' in r.json():
return self.custom_retry(r.json(), screenshot=screenshot)
return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

job_id = r.json()['job_id']
logger.debug(f"GETting status for {job_id=} on {url=}")

@@ -59,11 +61,11 @@ class WaybackArchiver(Archiver):
retries += 1

if status_r.status_code != 200:
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)

status_json = status_r.json()
if status_json['status'] != 'success':
return self.custom_retry(status_json, screenshot=screenshot)
return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz)

archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

@@ -75,8 +77,7 @@ class WaybackArchiver(Archiver):
title = 'Could not get title'
except:
title = "Could not get title"
screenshot = self.get_screenshot(url)
self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
return self.seen_urls[url]

def custom_retry(self, json_data, **kwargs):
@@ -6,15 +6,16 @@ from loguru import logger

from .base_archiver import Archiver, ArchiveResult
from storages import Storage
from configs import Config

class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

def __init__(self, storage: Storage, driver, fb_cookie):
super().__init__(storage, driver)
self.fb_cookie = fb_cookie
def __init__(self, storage: Storage, config: Config):
super().__init__(storage, config)
self.fb_cookie = config.facebook_cookie

def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)

@@ -93,6 +94,7 @@ class YoutubeDLArchiver(Archiver):

hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)

# get duration
duration = info.get('duration')

@@ -113,4 +115,4 @@ class YoutubeDLArchiver(Archiver):
timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
@@ -2,6 +2,7 @@ import os, datetime, traceback, random, tempfile

from loguru import logger
from slugify import slugify
from urllib.parse import quote

from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url

@@ -11,7 +12,7 @@ from storages import Storage
random.seed()

def update_sheet(gw, row, result: ArchiveResult):
def update_sheet(gw, row, url, result: ArchiveResult):
cell_updates = []
row_values = gw.get_row(row)

@@ -30,6 +31,8 @@ def update_sheet(gw, row, result: ArchiveResult):
batch_if_valid('duration', result.duration, str(result.duration))
batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash)
batch_if_valid('wacz', result.wacz)
batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')

if result.timestamp is not None:
if type(result.timestamp) == int:
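The `replaywebpage` cell built above simply points replayweb.page at the uploaded WACZ. A standalone sketch of the same URL construction, with made-up values:

```python
from urllib.parse import quote

wacz_url = "https://example-bucket.example-region.example-cdn.com/archive.wacz"  # hypothetical CDN URL of the WACZ
original_url = "https://example.com/post/123"  # hypothetical archived page
replay_url = f"https://replayweb.page/?source={quote(wacz_url)}#view=pages&url={quote(original_url)}"
print(replay_url)
```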
@@ -104,14 +107,14 @@ def process_sheet(c: Config):

# order matters, first to succeed excludes remaining
active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TiktokArchiver(storage, c.webdriver),
TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
TelethonArchiver(storage, c),
TiktokArchiver(storage, c),
TwitterApiArchiver(storage, c),
YoutubeDLArchiver(storage, c),
TelegramArchiver(storage, c),
TwitterArchiver(storage, c),
VkArchiver(storage, c),
WaybackArchiver(storage, c)
]

for archiver in active_archivers:

@@ -136,7 +139,7 @@ def process_sheet(c: Config):
logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

if result:
update_sheet(gw, row, result)
update_sheet(gw, row, url, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
except KeyboardInterrupt:
@@ -0,0 +1,6 @@
from dataclasses import dataclass

@dataclass
class BrowsertrixConfig:
profile: str
timeout_seconds: str
@@ -1,6 +1,4 @@

import argparse, yaml, json
from archivers.base_archiver import Archiver
import argparse, yaml, json, os
import gspread
from loguru import logger
from selenium import webdriver

@@ -13,6 +11,7 @@ from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from .twitter_api_config import TwitterApiConfig
from .browsertrix_config import BrowsertrixConfig
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig

@@ -82,7 +81,16 @@ class Config:
)
self.webdriver = "not initialized"

Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
# browsertrix config
browsertrix_configs = execution.get("browsertrix", {})
if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
browsertrix_profile = os.path.abspath(browsertrix_profile)
self.browsertrix_config = BrowsertrixConfig(
profile=browsertrix_profile,
timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
)

self.hash_algorithm = execution.get("hash_algorithm", "SHA-256")

# ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {})

@@ -208,6 +216,7 @@ class Config:
update the folder in each of the storages
"""
self.folder = folder
logger.info(f"setting folder to {folder}")
# s3
if hasattr(self, "s3_config"): self.s3_config.folder = folder
if hasattr(self, "s3_storage"): self.s3_storage.folder = folder

@@ -263,7 +272,8 @@ class Config:
"storage": self.storage,
"header": self.header,
"check_if_exists": self.check_if_exists,
"hash_algorithm": Archiver.HASH_ALGORITHM,
"hash_algorithm": self.hash_algorithm,
"browsertrix_config": asdict(self.browsertrix_config),
"save_logs": self.save_logs,
"selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None,
@@ -9,6 +9,7 @@ secrets:
secret: "s3 API secret"
# use region format like such
endpoint_url: "https://{region}.digitaloceanspaces.com"
# endpoint_url: "https://s3.{region}.amazonaws.com"
#use bucket, region, and key (key is the archived file path generated when executing) format like such as:
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online

@@ -101,6 +102,11 @@ execution:
timeout_seconds: 120
window_width: 1400
window_height: 2000

# optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
browsertrix:
profile: "./browsertrix/crawls/profile.tar.gz"
timeout_seconds: 90 # defaults to 90s
# puts execution logs into /logs folder, defaults to false
save_logs: true
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"

@@ -119,4 +125,7 @@ execution:
duration: duration
screenshot: screenshot
hash: hash
wacz: wacz
# if you want the replaypage to work, make sure to allow CORS on your bucket
replaywebpage: replaywebpage
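Regarding the CORS note above: for the replayweb.page column to work, the bucket serving the WACZ must allow cross-origin GET requests. One possible way to set that on an S3-compatible bucket with boto3 (an illustrative assumption, not part of this change):

```python
import boto3

s3 = boto3.client("s3")  # credentials / endpoint_url as configured for your bucket
s3.put_bucket_cors(
    Bucket="your-bucket-name",  # hypothetical bucket name
    CORSConfiguration={
        "CORSRules": [{
            "AllowedMethods": ["GET"],
            "AllowedOrigins": ["https://replayweb.page"],
            "AllowedHeaders": ["*"],
        }]
    },
)
```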
@@ -67,9 +67,14 @@ class S3Storage(Storage):
return False

def uploadf(self, file, key, **kwargs):
if self.private:
extra_args = kwargs.get("extra_args", {})
else:
extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'

if 'ContentType' not in extra_args:
try:
extra_args['ContentType'] = mimetypes.guess_type(key)[0]
except Exception as e:
logger.error(f"Unable to get mimetype for {key=}, error: {e}")

self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
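A side note on the ContentType fallback above (standalone illustration, not repository code): `mimetypes.guess_type` does not recognise the `.wacz` extension, which is presumably why the WACZ upload passes `ContentType: application/zip` explicitly instead of relying on this guess.

```python
import mimetypes

print(mimetypes.guess_type("snapshot.png")[0])  # image/png
print(mimetypes.guess_type("archive.wacz")[0])  # None - no registered type for .wacz
```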
@@ -20,7 +20,9 @@ class GWorksheet:
'title': 'upload title',
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash'
'hash': 'hash',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
}

def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):