auto-archiver/example.config.yaml

---
secrets:
  # needed if you use storage=s3
  s3:
    # contains S3 info on region, bucket, key and secret
    region: reg1
    bucket: my-bucket
    key: "s3 API key"
    secret: "s3 API secret"
    # endpoint_url supports a {region} placeholder, like so
    endpoint_url: "https://{region}.digitaloceanspaces.com"
    # endpoint_url: "https://s3.{region}.amazonaws.com"
    # cdn_url supports {bucket}, {region} and {key} placeholders ({key} is the archived file path generated during execution), like so
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private: true, S3 URLs will not be readable online
    private: false
    # 'random' generates a random UUID for the URL instead of a predictable path, useful to keep files public but unlisted; the alternative is 'default', which also applies if this option is omitted
    key_path: random
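    # For illustration only (the key below is hypothetical): with region "reg1" and bucket "my-bucket"
    # as above, an archived file whose generated key is "my-folder/abc123.jpg" would resolve to
    # https://my-bucket.reg1.cdn.digitaloceanspaces.com/my-folder/abc123.jpg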
  # needed if you use storage=gd
  google_drive:
    # To authenticate with google you have two options (1. service account OR 2. OAuth token)
    # 1. service account - storage space will count towards the developer account
    # filename can be the same or a different file from google_sheets.service_account, defaults to "service_account.json"
    # service_account: "service_account.json"
    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
    # (only 1. or 2. - if both are specified then 2. takes precedence)
    # needs write access on the server so the refresh flow works
    # To get the token, run the file `create_update_test_oauth_token.py`
    # you can edit that file if you want a different token filename, default is "gd-token.json"
    oauth_token_filename: "gd-token.json"
    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
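    # For illustration only (hypothetical folder ID): if the shared folder's URL is
    # https://drive.google.com/drive/folders/1AbCdEfGhIjKlMnOpQrStUvWxYz, then set
    # root_folder_id: 1AbCdEfGhIjKlMnOpQrStUvWxYz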
  # needed if you use storage=local
  local:
    # local path to save files in
    save_to: "./local_archive"
  wayback:
    # to get credentials visit https://archive.org/account/s3.php
    key: your API key
    secret: your API secret
  telegram:
    # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    api_id: your API ID
    api_hash: your API hash
    # optional, but allows access to more content such as large videos; talk to @botfather
    bot_token: your bot-token
    # optional, defaults to ./anon, records the telegram login session for future usage
    session_file: "secrets/anon"
  # twitter configuration - API V2 only
  # if you don't provide credentials, the less effective unofficial TwitterArchiver will be used instead
  twitter:
    # either bearer_token only
    bearer_token: ""
    # OR all of the below
    consumer_key: ""
    consumer_secret: ""
    access_token: ""
    access_secret: ""
  # vkontakte (vk.com) credentials
  vk:
    username: "phone number or email"
    password: "password"
    # optional, defaults to ./vk_config.v2.json, records the VK login session for future usage
    session_file: "secrets/vk_config.v2.json"
  # instagram credentials
  instagram:
    username: "username"
    password: "password"
    session_file: "instaloader.session" # <- default value
  google_sheets:
    # local filename, defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: "service_account.json"
  facebook:
    # optional facebook cookie taken from your browser, gives access to more content; looks like 'cookie: datr=xxxx'
    cookie: ""
execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name
  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
  # worksheet_allow and worksheet_block can be single values or lists
  # if worksheet_allow is specified, worksheet_block is ignored
  # worksheet_allow:
  #   - Sheet1
  #   - "Sheet 2"
  # worksheet_block: BlockedSheet
  # which row of your tabs contains the header, can be overwritten with CMD --header=
  header: 1
  # which storage to use, can be overwritten with CMD --storage=
  storage: s3
  # defaults to false; when true, will try to avoid duplicate URL archives
  check_if_exists: true
  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
  # hash_algorithm: SHA-256
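  # For illustration only (the script name is an assumption - check the repository README for the
  # exact entry point): the CMD overrides mentioned above are passed on the command line, e.g.
  # python auto_archive.py --sheet="your-sheet-name" --header=1 --storage=local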
  # optional configuration for the selenium browser that takes screenshots; these are the defaults
  selenium:
    # values under 10s may cause screenshots to fail
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000
  # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
  # browsertrix captures a WACZ archive of the page, which can then be viewed as the original in ReplayWeb.page
  browsertrix:
    enabled: true # defaults to false
    profile: "./browsertrix/crawls/profile.tar.gz"
    timeout_seconds: 120 # defaults to 90s
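    # For illustration only, adapted from the browsertrix-crawler README linked above (verify the
    # command and flags against the current docs): a logged-in browser profile can be generated with
    # something like
    # docker run -it -v $PWD/browsertrix/crawls/profiles:/crawls/profiles/ webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/login"
    # and the resulting profile.tar.gz path goes in the 'profile' option above.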
  # puts execution logs into the /logs folder, defaults to false
  save_logs: true
  # custom column names, only needed if different from the defaults, can be overwritten with CMD --col-NAME="VALUE"
  # url and status are the only columns required to be present in the google sheet
  column_names:
    url: link
    status: archive status
    archive: archive location
    # use this column to override the default location data
    folder: folder
    date: archive date
    thumbnail: thumbnail
    thumbnail_index: thumbnail index
    timestamp: upload timestamp
    title: upload title
    duration: duration
    screenshot: screenshot
    hash: hash
    wacz: wacz
    # if you want the replay page to work, make sure to allow CORS on your bucket, see https://replayweb.page/docs/embedding#cors-restrictions
    replaywebpage: replaywebpage
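    # For illustration only (hypothetical column header): if the URL column in your sheet is titled
    # "Media URL", either set `url: Media URL` above or override it at runtime with
    # --col-url="Media URL"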