# mirrored from https://github.com/bellingcat/auto-archiver
---
secrets:
  # needed if you use storage=s3
  s3:
    # contains S3 info on region, bucket, key and secret
    region: reg1
    bucket: my-bucket
    key: "s3 API key"
    secret: "s3 API secret"
    # use a {region}-based format, such as:
    endpoint_url: "https://{region}.digitaloceanspaces.com"
    # endpoint_url: "https://s3.{region}.amazonaws.com"
    # use a {bucket}, {region}, and {key} format ({key} is the archived file path generated when executing), such as:
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private: true, S3 URLs will not be readable online
    private: false
    # with 'random' a random UUID is used for the URL instead of a predictable path, useful to keep files public but unlisted; the alternative is 'default', or simply omit this key from the config
    key_path: random
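
    # Illustrative AWS-style variant (a sketch, not a tested configuration; the bucket and
    # region values are placeholders like those above):
    # endpoint_url: "https://s3.{region}.amazonaws.com"
    # cdn_url: "https://{bucket}.s3.{region}.amazonaws.com/{key}"
    # key_path: default   # the alternative to 'random': a predictable path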

  # needed if you use storage=gd
  google_drive:
    # To authenticate with Google you have two options (1. service account OR 2. OAuth token)

    # 1. service account - storage space will count towards the developer account
    # filename can be the same as or a different file from google_sheets.service_account, defaults to "service_account.json"
    # service_account: "service_account.json"

    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
    # (use either 1. or 2. - if both are specified then 2. takes precedence)
    # the token file needs write access on the server so the refresh flow works
    # To get the token, run the file `create_update_test_oauth_token.py`
    # you can edit that file if you want a different token filename, default is "gd-token.json"
    oauth_token_filename: "gd-token.json"

    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
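
    # Sketch (not from the upstream example): to use option 1 instead, comment out
    # oauth_token_filename above and set the service account file, e.g.
    # service_account: "service_account.json"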

  # needed if you use storage=local
  local:
    # local path to save files in
    save_to: "./local_archive"

  wayback:
    # to get credentials visit https://archive.org/account/s3.php
    key: your API key
    secret: your API secret

  telegram:
    # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    api_id: your API id
    api_hash: your API hash
    # optional, but allows access to more content such as large videos, talk to @botfather
    bot_token: your bot-token
    # optional, defaults to ./anon, records the telegram login session for future usage
    session_file: "secrets/anon"

  # twitter configuration - API V2 only
  # if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead
  twitter:
    # either bearer_token only
    bearer_token: ""
    # OR all of the below
    consumer_key: ""
    consumer_secret: ""
    access_token: ""
    access_secret: ""

  # vkontakte (vk.com) credentials
  vk:
    username: "phone number or email"
    password: "password"
    # optional, defaults to ./vk_config.v2.json, records VK login session for future usage
    session_file: "secrets/vk_config.v2.json"

  # instagram credentials
  instagram:
    username: "username"
    password: "password"
    session_file: "instaloader.session" # <- default value

  google_sheets:
    # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: "service_account.json"

  facebook:
    # optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'
    cookie: ""

execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name

  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
  # worksheet_allow and worksheet_block can be single values or lists
  # if worksheet_allow is specified, worksheet_block is ignored
  # worksheet_allow:
  #  - Sheet1
  #  - "Sheet 2"
  # worksheet_block: BlockedSheet

  # which row of your tabs contains the header, can be overwritten with CMD --header=
  header: 1
  # which storage to use, can be overwritten with CMD --storage=
  storage: s3

  # defaults to false, when true will try to avoid duplicate URL archives
  check_if_exists: true
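
  # Illustrative only - assuming the CLI entry point is auto_archive.py, the command-line
  # overrides mentioned above would be combined roughly like:
  # python auto_archive.py --sheet="your-sheet-name" --header=1 --storage=local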

  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
  # hash_algorithm: SHA-256

  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # values under 10s might mean screenshots fail to be captured
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000

  # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
  # browsertrix captures a WACZ archive of the page which can then be viewed as the original via replayweb.page
  browsertrix:
    enabled: true # defaults to false
    profile: "./browsertrix/crawls/profile.tar.gz"
    timeout_seconds: 120 # defaults to 90s
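
  # Sketch of generating the profile above with browsertrix-crawler's create-login-profile
  # command (the README linked above is authoritative; paths and flags here are assumptions):
  # docker run -v $PWD/browsertrix/crawls:/crawls/ -it webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/login"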

  # puts execution logs into /logs folder, defaults to false
  save_logs: true

  # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
  # url and status are the only columns required to be present in the google sheet
  column_names:
    url: link
    status: archive status
    archive: archive location
    # use this column to override default location data
    folder: folder
    date: archive date
    thumbnail: thumbnail
    thumbnail_index: thumbnail index
    timestamp: upload timestamp
    title: upload title
    duration: duration
    screenshot: screenshot
    hash: hash
    wacz: wacz
    # if you want the replaypage to work, make sure to allow CORS on your bucket, see https://replayweb.page/docs/embedding#cors-restrictions
    replaywebpage: replaywebpage
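
  # Illustrative only (same auto_archive.py assumption as above): the --col-NAME override
  # mentioned before column_names would look roughly like
  # python auto_archive.py --sheet="your-sheet-name" --col-url="link"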