---
# Example configuration.
#   secrets:   credentials and storage settings for each supported service
#   execution: options controlling a run (sheet, storage, browser, column names)

secrets:
  # needed if you use storage=s3
  s3:
    # contains S3 info on region, bucket, key and secret
    region: reg1
    bucket: my-bucket
    key: "s3 API key"
    secret: "s3 API secret"
    # use the {region} placeholder in the endpoint, like so:
    endpoint_url: "https://{region}.digitaloceanspaces.com"
    # endpoint_url: "https://s3.{region}.amazonaws.com"
    # use the {bucket}, {region} and {key} placeholders, like so
    # ({key} is the archived file path generated when executing):
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private is true, S3 URLs will not be readable online
    private: false
    # with 'random' you can generate a random UUID for the URL instead of a
    # predictable path, useful to still have public but unlisted files;
    # the alternative is 'default' (or omit key_path from the config)
    key_path: random

  # needed if you use storage=gd
  google_drive:
    # To authenticate with google you have two options
    # (1. service account OR 2. OAuth token)

    # 1. service account - storage space will count towards the developer account
    # filename can be the same or a different file from
    # google_sheets.service_account, defaults to "service_account.json"
    # service_account: "service_account.json"

    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
    # (only 1. or 2. - if both are specified then option 2. takes precedence)
    # needs write access on the server so the refresh flow works
    # To get the token, run the file `create_update_test_oauth_token.py`
    # you can edit that file if you want a different token filename,
    # default is "gd-token.json"
    oauth_token_filename: "gd-token.json"

    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

  # needed if you use storage=local
  local:
    # local path to save files in
    save_to: "./local_archive"

  wayback:
    # to get credentials visit https://archive.org/account/s3.php
    key: your API key
    secret: your API secret

  telegram:
    # to get credentials see:
    # https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    api_id: your API key, see
    api_hash: your API hash
    # optional, but allows access to more content such as large videos,
    # talk to @botfather
    bot_token: your bot-token
    # optional, defaults to ./anon, records the telegram login session
    # for future usage
    session_file: "secrets/anon"

  # twitter configuration - API V2 only
  # if you don't provide credentials the less-effective unofficial
  # TwitterArchiver will be used instead
  twitter:
    # either bearer_token only
    bearer_token: ""
    # OR all of the below
    consumer_key: ""
    consumer_secret: ""
    access_token: ""
    access_secret: ""

  # vkontakte (vk.com) credentials
  vk:
    username: "phone number or email"
    password: "password"
    # optional, defaults to ./vk_config.v2.json, records the VK login session
    # for future usage
    session_file: "secrets/vk_config.v2.json"

  # instagram credentials
  instagram:
    username: "username"
    password: "password"
    session_file: "instaloader.session"  # <- default value

  google_sheets:
    # local filename: defaults to service_account.json, see
    # https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: "service_account.json"

  facebook:
    # optional facebook cookie to have more access to content, taken from the
    # browser, looks like 'cookie: datr= xxxx'
    cookie: ""

execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name

  # block or allow worksheets by name, instead of defaulting to checking all
  # worksheets in a Spreadsheet
  # worksheet_allow and worksheet_block can be single values or lists
  # if worksheet_allow is specified, worksheet_block is ignored
  # worksheet_allow:
  #   - Sheet1
  #   - "Sheet 2"
  # worksheet_block: BlockedSheet

  # which row of your tabs contains the header,
  # can be overwritten with CMD --header=
  header: 1

  # which storage to use, can be overwritten with CMD --storage=
  storage: s3

  # defaults to false, when true will try to avoid duplicate URL archives
  check_if_exists: true

  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
  # hash_algorithm: SHA-256

  # optional configurations for the selenium browser that takes screenshots,
  # these are the defaults
  selenium:
    # values under 10s might cause screenshots to fail
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000

  # optional browsertrix configuration (for profile generation see
  # https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
  # browsertrix will capture a WACZ archive of the page which can then be seen
  # as the original on replaywebpage
  browsertrix:
    enabled: true  # defaults to false
    profile: "./browsertrix/crawls/profile.tar.gz"
    timeout_seconds: 120  # defaults to 90s

  # puts execution logs into /logs folder, defaults to false
  save_logs: true

  # custom column names, only needed if different from default,
  # can be overwritten with CMD --col-NAME="VALUE"
  # url and status are the only columns required to be present in the google sheet
  column_names:
    url: link
    status: archive status
    archive: archive location
    # use this column to override default location data
    folder: folder
    date: archive date
    thumbnail: thumbnail
    thumbnail_index: thumbnail index
    timestamp: upload timestamp
    title: upload title
    duration: duration
    screenshot: screenshot
    hash: hash
    wacz: wacz
    # if you want the replaypage to work, make sure to allow CORS on your
    # bucket, see https://replayweb.page/docs/embedding#cors-restrictions
    replaywebpage: replaywebpage