Revert "add metadata endpoint"

This reverts commit 82153b283b3be8e7a48da92a6d02d05ef28e98c5.
pull/1/head
Tao Bojlén 2019-08-09 16:59:51 +00:00
parent 9a0bbbb7d9
commit 3320e050c8
14 changed files with 477 additions and 170 deletions

View file

@@ -11,10 +11,12 @@ defmodule Backend.Crawler.ApiCrawler do
   * Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
   """
+  alias Backend.Crawler.Crawlers.Nodeinfo
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}
-  @type instance_type :: :mastodon | :pleroma | :gab | :misskey
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey | :gnusocial
   defstruct [
     :version,
@@ -30,8 +32,8 @@ defmodule Backend.Crawler.ApiCrawler do
   @type t() :: %__MODULE__{
           version: String.t(),
           description: String.t(),
-          user_count: integer,
-          status_count: integer,
+          user_count: integer | nil,
+          status_count: integer | nil,
           peers: [String.t()],
           interactions: instance_interactions,
           statuses_seen: integer,
@@ -40,8 +42,9 @@ defmodule Backend.Crawler.ApiCrawler do
   @doc """
   Check whether the instance at the given domain is of the type that this ApiCrawler implements.
+  Arguments are the instance domain and the nodeinfo results.
   """
-  @callback is_instance_type?(String.t()) :: boolean()
+  @callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()
   @doc """
   Check whether the instance allows crawling according to its robots.txt or otherwise.
@@ -50,6 +53,7 @@ defmodule Backend.Crawler.ApiCrawler do
   @doc """
   Crawl the instance at the given domain.
+  Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
   """
-  @callback crawl(String.t()) :: t()
+  @callback crawl(String.t(), Nodeinfo.t()) :: t()
 end
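
For orientation, a minimal sketch of a crawler module written against the updated two-argument callbacks. The module name, the :example instance type, and the final merge are illustrative only (modelled on the GnuSocial crawler added below), not part of this commit.

# Illustrative example, not part of the commit.
defmodule Backend.Crawler.Crawlers.ExampleCrawler do
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  # The nodeinfo result (or nil) is now passed alongside the domain.
  @impl ApiCrawler
  def is_instance_type?(_domain, nodeinfo_result) do
    nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :example
  end

  @impl ApiCrawler
  def allows_crawling?(_domain), do: true

  @impl ApiCrawler
  def crawl(_domain, nodeinfo_result) do
    # Reuse the counts found by the Nodeinfo pre-crawl and add the remaining fields.
    Map.merge(nodeinfo_result, %{peers: [], interactions: %{}, statuses_seen: 0})
  end
end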

View file

@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """
   alias __MODULE__
-  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
+  alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -16,8 +16,8 @@ defmodule Backend.Crawler do
     :domain,
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
-    :found_api?,
     :allows_crawling?,
+    :found_api?,
     :result,
     :error
   ]
@@ -25,8 +25,8 @@ defmodule Backend.Crawler do
   @type t() :: %__MODULE__{
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
-          found_api?: boolean,
           allows_crawling?: boolean,
+          found_api?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
         }
@@ -37,16 +37,18 @@ defmodule Backend.Crawler do
     state = %Crawler{
       domain: domain,
       api_crawlers: [],
-      found_api?: false,
       allows_crawling?: true,
+      found_api?: false,
       result: nil,
       error: nil
     }
     state
-    # register APICrawlers here
+    # These crawlers are run in the order they're registered. Nodeinfo should be the first one.
+    |> register(Nodeinfo)
     |> register(Mastodon)
     |> register(Misskey)
+    |> register(GnuSocial)
     # go!
     |> crawl()
     |> save()
@@ -56,33 +58,47 @@ defmodule Backend.Crawler do
   # Adds a new ApiCrawler that run/1 will check.
   defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
-    Map.put(state, :api_crawlers, [api_crawler | crawlers])
+    Map.put(state, :api_crawlers, crawlers ++ [api_crawler])
   end
   # Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
   # If so, crawls it. If not, continues with the tail of the api_crawlers list.
   defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
     Logger.debug("Found no compatible API for #{domain}")
-    Map.put(state, :found_api?, false)
+    state
   end
-  defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
-    if curr.is_instance_type?(domain) do
+  # Nodeinfo is distinct from other crawlers in that
+  # a) it should always be run first
+  # b) it passes the results on to the next crawlers (e.g. user_count)
+  defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
+    with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
+      Logger.debug("Found nodeinfo for #{domain}.")
+      result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
+      crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
+    else
+      _ ->
+        Logger.debug("Did not find nodeinfo for #{domain}.")
+        crawl(%Crawler{state | api_crawlers: remaining_crawlers})
+    end
+  end
+  defp crawl(
+         %Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
+           state
+       ) do
+    if curr.is_instance_type?(domain, result) do
       Logger.debug("Found #{curr} instance")
-      state = Map.put(state, :found_api?, true)
       if curr.allows_crawling?(domain) do
         try do
-          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+          %Crawler{state | result: curr.crawl(domain, result), found_api?: true}
         rescue
           e in HTTPoison.Error ->
             Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
           e in Jason.DecodeError ->
             Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
+          e in _ ->
+            Map.put(state, :error, "Unknown error: " <> inspect(e))
         end
       else
         Logger.debug("#{domain} does not allow crawling.")
@@ -99,9 +115,9 @@ defmodule Backend.Crawler do
   defp save(%Crawler{
          domain: domain,
          result: result,
-         found_api?: true,
         error: nil,
-         allows_crawling?: true
+         allows_crawling?: true,
+         found_api?: true
       }) do
     now = get_now()
@@ -240,7 +256,7 @@ defmodule Backend.Crawler do
       cond do
         not allows_crawling -> "robots.txt"
         error == nil -> "no api found"
-        true -> "unknown error"
+        true -> error
       end
     # The "+1" is this error!
@@ -250,25 +266,25 @@ defmodule Backend.Crawler do
       |> Map.get(:crawl_error_count)
      |> Kernel.+(1)
-    # The crawl interval grows exponentially at first but never goes above 72 hours
+    # The crawl interval grows exponentially at first but never goes above 24 hours
     crawl_interval_mins =
-      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
+      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
-    Repo.transaction(fn ->
-      Repo.insert!(
-        %Instance{
-          domain: domain,
-          base_domain: get_base_domain(domain),
-          crawl_error: error,
-          crawl_error_count: error_count,
-          next_crawl: next_crawl
-        },
-        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
-        conflict_target: :domain
-      )
-    end)
+    Repo.insert!(
+      %Instance{
+        domain: domain,
+        base_domain: get_base_domain(domain),
+        crawl_error: error,
+        crawl_error_count: error_count,
+        next_crawl: next_crawl,
+        updated_at: now
+      },
+      on_conflict:
+        {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl, :updated_at]},
+      conflict_target: :domain
+    )
     Appsignal.increment_counter("crawler.failure", 1)
   end
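
To make the retry change above concrete: with a base interval of 30 minutes (an illustrative value; the real one comes from get_config(:crawl_interval_mins)), the backoff doubles per consecutive error and now caps at 1440 minutes (24 hours) instead of 4320 (72 hours).

# Illustrative only; 30 is a made-up base interval.
base_interval_mins = 30

Enum.map(1..8, fn error_count ->
  min(base_interval_mins * round(:math.pow(2, error_count)), 1440)
end)
# => [60, 120, 240, 480, 960, 1440, 1440, 1440]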

View file

@@ -0,0 +1,178 @@
+defmodule Backend.Crawler.Crawlers.GnuSocial do
+  alias Backend.Crawler.ApiCrawler
+  alias Backend.Crawler.Crawlers.Nodeinfo
+  import Backend.Crawler.Util
+  import Backend.Util
+  require Logger
+  @behaviour ApiCrawler
+  @impl ApiCrawler
+  def is_instance_type?(_domain, nodeinfo_result) do
+    nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :gnusocial
+  end
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    [
+      "/api/statuses/public_timeline.json"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+  @impl ApiCrawler
+  def crawl(domain, nodeinfo_result) do
+    if nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
+      crawl_large_instance(domain, nodeinfo_result)
+    else
+      nodeinfo_result
+    end
+  end
+  @spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
+  defp crawl_large_instance(domain, nodeinfo_result) do
+    status_datetime_threshold =
+      NaiveDateTime.utc_now()
+      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
+    # Don't get any statuses older than this
+    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
+    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
+    Map.merge(nodeinfo_result, %{
+      interactions: interactions,
+      statuses_seen: statuses_seen,
+      peers: []
+    })
+  end
+  @spec get_interactions(
+          String.t(),
+          NaiveDateTime.t(),
+          String.t() | nil,
+          ApiCrawler.instance_interactions(),
+          integer()
+        ) :: {ApiCrawler.instance_interactions(), integer()}
+  defp get_interactions(
+         domain,
+         min_timestamp,
+         max_id \\ nil,
+         interactions \\ %{},
+         statuses_seen \\ 0
+       ) do
+    endpoint = "https://#{domain}/api/statuses/public_timeline.json"
+    endpoint =
+      if max_id != nil do
+        endpoint <> "?max_id=#{max_id}"
+      else
+        endpoint
+      end
+    Logger.debug("Crawling #{endpoint}")
+    statuses = get_and_decode!(endpoint)
+    # Filter to statuses that are in the correct timeframe
+    filtered_statuses =
+      statuses
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> parse_timestamp()
+        |> is_after?(min_timestamp)
+      end)
+    if length(filtered_statuses) > 0 do
+      # Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
+      eligible_statuses =
+        filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)
+      # get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
+      interactions =
+        eligible_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+      statuses_seen =
+        eligible_statuses
+        |> Kernel.length()
+        |> Kernel.+(statuses_seen)
+      oldest_status = Enum.at(filtered_statuses, -1)
+      oldest_status_datetime =
+        oldest_status
+        |> Map.get("created_at")
+        |> parse_timestamp()
+      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
+           statuses_seen < get_config(:status_count_limit) and
+           length(filtered_statuses) == length(statuses) do
+        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
+      else
+        {interactions, statuses_seen}
+      end
+    else
+      {interactions, statuses_seen}
+    end
+  end
+  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
+  defp statuses_to_interactions(statuses) do
+    statuses
+    |> Enum.filter(fn status -> is_mention?(status) end)
+    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
+    |> Enum.reduce(%{}, fn map, acc ->
+      Map.merge(acc, map)
+    end)
+  end
+  # Checks whether the status contains one or more mentions
+  @spec is_mention?(any()) :: boolean()
+  defp is_mention?(%{"attentions" => []}) do
+    false
+  end
+  defp is_mention?(_status) do
+    true
+  end
+  @spec is_fave?(any()) :: boolean()
+  defp is_fave?(status) do
+    uri_elements = status |> Map.get("uri") |> String.split(":")
+    Enum.member?(uri_elements, "fave")
+  end
+  @spec has_nobot?(any()) :: boolean()
+  defp has_nobot?(status) do
+    case get_in(status, ["user", "description"]) do
+      nil ->
+        false
+      description ->
+        description
+        |> String.downcase()
+        |> String.contains?("nobot")
+    end
+  end
+  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
+  defp extract_mentions_from_status(status) do
+    status["attentions"]
+    |> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
+    |> Enum.reduce(%{}, fn domain, acc ->
+      Map.update(acc, domain, 1, &(&1 + 1))
+    end)
+  end
+  # Parses the messed-up time format that GNU social uses
+  # Like seriously, it's 2019, why *wouldn't* you use iso8601?
+  @spec parse_timestamp(String.t()) :: NaiveDateTime.t()
+  defp parse_timestamp(timestamp) do
+    timestamp
+    |> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
+    |> Timex.to_naive_datetime()
+  end
+end
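
To illustrate the mention counting at the end of this module: given a made-up GNU Social status with two "attentions" pointing at the same remote instance, extract_mentions_from_status/1 tallies one count per mentioned domain (get_domain/1 comes from Backend.Crawler.Util, imported above).

# Illustrative status map; the "attentions"/"profileurl" fields are the ones the crawler reads.
status = %{
  "attentions" => [
    %{"profileurl" => "https://quitter.example/user1"},
    %{"profileurl" => "https://quitter.example/user2"}
  ]
}

status["attentions"]
|> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
|> Enum.reduce(%{}, fn domain, acc -> Map.update(acc, domain, 1, &(&1 + 1)) end)
# => %{"quitter.example" => 2}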

View file

@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @behaviour ApiCrawler
   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get("https://#{domain}/api/v1/instance") do
-      {:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
-      {:error, _error} -> false
+  def is_instance_type?(domain, result) do
+    # We might already know that this is a Pleroma instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :pleroma do
+      true
+    else
+      case get_and_decode("https://#{domain}/api/v1/instance") do
+        {:ok, %{"title" => _title}} -> true
+        _other -> false
+      end
     end
   end
@@ -26,8 +31,8 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   end
   @impl ApiCrawler
-  def crawl(domain) do
-    instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+  def crawl(domain, _current_result) do
+    instance = get_and_decode!("https://#{domain}/api/v1/instance")
     user_count = get_in(instance, ["stats", "user_count"])
     if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
@@ -51,12 +56,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
   defp crawl_large_instance(domain, instance) do
-    # servers may not publish peers
-    peers =
-      case get("https://#{domain}/api/v1/instance/peers") do
-        {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
-        {:error, _error} -> []
-      end
+    peers = get_peers(domain)
     Logger.debug("Found #{length(peers)} peers.")
@@ -124,15 +124,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     Logger.debug("Crawling #{endpoint}")
-    statuses =
-      endpoint
-      |> get!()
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = get_and_decode!(endpoint)
     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -166,12 +166,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end
-  # To check if the endpoint works as expected
-  @spec has_title?(String.t()) :: boolean
-  defp has_title?(body) do
-    case Jason.decode(body) do
-      {:ok, decoded} -> Map.has_key?(decoded, "title")
-      {:error, _error} -> false
+  defp get_peers(domain) do
+    # servers may not publish peers
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> peers
+      {:error, _err} -> []
     end
   end
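
In short, when the Nodeinfo pre-crawl has already identified a Pleroma server, the Mastodon crawler claims the instance without another HTTP request; only when no nodeinfo result exists does it probe /api/v1/instance. A hypothetical call (domains and aliases are illustrative):

# Illustrative only; assumes `alias Backend.Crawler.Crawlers.{Mastodon, Nodeinfo}`.
Mastodon.is_instance_type?("pleroma.example", %Nodeinfo{instance_type: :pleroma})
# => true, with no request made

Mastodon.is_instance_type?("mastodon.example", nil)
# => decided by whether GET /api/v1/instance returns JSON containing a "title" key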

View file

@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   require Logger
   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get_version_and_description(domain) do
-      {:ok, _} -> true
-      {:error, _} -> false
+  def is_instance_type?(domain, result) do
+    # We may already know that this is a Misskey instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :misskey do
+      true
+    else
+      case get_version_and_description(domain) do
+        {:ok, _} -> true
+        {:error, _} -> false
+      end
     end
   end
@@ -27,11 +32,9 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end
   @impl ApiCrawler
-  def crawl(domain) do
-    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
-      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
-        Jason.decode!(stats_body)
+  def crawl(domain, _result) do
+    with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
+           post_and_decode("https://#{domain}/api/stats") do
       if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
         crawl_large_instance(domain, user_count, status_count)
       else
@@ -107,15 +110,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
     Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")
-    statuses =
-      endpoint
-      |> post!(Jason.encode!(params))
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = post_and_decode!(endpoint, Jason.encode!(params))
     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["createdAt"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -151,35 +154,22 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end
   @spec get_version_and_description(String.t()) ::
-          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+          {:ok, {String.t(), String.t()}} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
   defp get_version_and_description(domain) do
-    case post("https://#{domain}/api/meta") do
-      {:ok, %{status_code: 200, body: body}} ->
-        case Jason.decode(body) do
-          {:ok, decoded} ->
-            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
-          {:error, _error} ->
-            {:error, "invalid response"}
-        end
-      _ ->
-        {:error, "unsuccesful request"}
+    case post_and_decode("https://#{domain}/api/meta") do
+      {:ok, %{"version" => version, "description" => description}} ->
+        {:ok, {version, description}}
+      {:error, err} ->
+        {:error, err}
     end
   end
   @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
   defp get_peers(domain) do
-    case get("https://#{domain}/api/v1/instance/peers") do
-      {:ok, response} ->
-        with %{status_code: 200, body: body} <- response do
-          Jason.decode(body)
-        else
-          _ -> {:ok, []}
-        end
-      {:error, _} ->
-        {:ok, []}
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> {:ok, peers}
+      {:error, _} -> {:ok, []}
     end
   end
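
A side effect of the crawl/2 change above: the response keys are now matched in the `with` head, so an /api/stats reply that fails to decode or lacks originalUsersCount/originalNotesCount falls through to the `else` branch instead of raising from the old `Jason.decode!` plus bare `=` match. A rough sketch, with a made-up domain:

# Illustrative only.
with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
       post_and_decode("https://misskey.example/api/stats") do
  {user_count, status_count}
else
  _not_misskey_or_unreachable -> :skipped
end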

View file

@@ -0,0 +1,117 @@
+defmodule Backend.Crawler.Crawlers.Nodeinfo do
+  alias Backend.Crawler.ApiCrawler
+  require Logger
+  import Backend.Util
+  import Backend.Crawler.Util
+  @moduledoc """
+  This module is slightly different from the other crawlers.
+  It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
+  Instead, it's run before all the other crawlers.
+  This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
+  nodeinfo to know whether it's a personal instance or not.
+  """
+  defstruct [
+    :description,
+    :user_count,
+    :status_count,
+    :instance_type,
+    :version
+  ]
+  @type t() :: %__MODULE__{
+          description: String.t(),
+          user_count: integer,
+          status_count: integer,
+          instance_type: ApiCrawler.instance_type(),
+          version: String.t()
+        }
+  @spec allows_crawling?(String.t()) :: boolean()
+  def allows_crawling?(domain) do
+    [
+      ".well-known/nodeinfo"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+  @spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
+  def crawl(domain) do
+    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
+         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
+      {:ok, nodeinfo}
+    else
+      _other -> {:error, nil}
+    end
+  end
+  @spec get_nodeinfo_url(String.t()) ::
+          {:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo_url(domain) do
+    case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
+      {:ok, response} -> {:ok, process_nodeinfo_url(response)}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec process_nodeinfo_url(any()) :: String.t()
+  defp process_nodeinfo_url(response) do
+    response
+    |> Map.get("links")
+    |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
+    |> Kernel.hd()
+    |> Map.get("href")
+  end
+  @spec get_nodeinfo(String.t()) ::
+          {:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo(nodeinfo_url) do
+    case get_and_decode(nodeinfo_url) do
+      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec process_nodeinfo(any()) :: t()
+  defp process_nodeinfo(nodeinfo) do
+    user_count = get_in(nodeinfo, ["usage", "users", "total"])
+    if is_above_user_threshold?(user_count) do
+      # Both of these are used, depending on the server implementation
+      description =
+        [
+          get_in(nodeinfo, ["metadata", "description"]),
+          get_in(nodeinfo, ["metadata", "nodeDescription"])
+        ]
+        |> Enum.filter(fn d -> d != nil end)
+        |> Enum.at(0)
+      type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
+      %__MODULE__{
+        description: description,
+        user_count: user_count,
+        status_count: get_in(nodeinfo, ["usage", "localPosts"]),
+        instance_type: type,
+        version: get_in(nodeinfo, ["software", "version"])
+      }
+    else
+      %{
+        description: nil,
+        user_count: user_count,
+        status_count: nil,
+        instance_type: nil,
+        version: nil
+      }
+    end
+  end
+  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
+  defp is_compatible_nodeinfo_version?(schema_url) do
+    version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
+    Enum.member?(["1.0", "1.1", "2.0"], version)
+  end
+end
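
For reference, a trimmed-down sketch of the kind of nodeinfo document process_nodeinfo/1 consumes (all values invented): the software name is downcased into the instance_type atom, and the usage block supplies the user and status counts that the later crawlers reuse.

# Illustrative nodeinfo payload, already decoded from JSON.
nodeinfo = %{
  "software" => %{"name" => "Pleroma", "version" => "1.0.0"},
  "usage" => %{"users" => %{"total" => 1200}, "localPosts" => 540_000},
  "metadata" => %{"nodeDescription" => "An example instance"}
}

get_in(nodeinfo, ["usage", "users", "total"])
# => 1200

nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
# => :pleroma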

View file

@@ -54,7 +54,7 @@ defmodule Backend.Crawler.StaleInstanceManager do
     stale_domains =
       Instance
       |> select([i], i.domain)
-      |> where([i], i.next_crawl < ^now)
+      |> where([i], i.next_crawl < ^now and not i.opt_out)
       |> Repo.all()
       |> MapSet.new()

View file

@@ -8,27 +8,19 @@ defmodule Backend.Crawler.Util do
   # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
   @spec get_domain(String.t()) :: String.t()
   def get_domain(url) do
-    String.slice(url, 8..-1)
-    |> String.split("/")
-    |> Enum.at(0)
+    [_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url)
+    domain
   end
-  @spec is_http_200?(HTTPoison.Response.t()) :: boolean
-  def is_http_200?(%{status_code: 200}) do
-    true
-  end
-  def is_http_200?(_) do
-    false
-  end
-  @spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
+  @doc """
+  Returns true if the first argument is after the second.
+  """
+  @spec is_after?(NaiveDateTime.t(), NaiveDateTime.t() | nil) :: boolean()
   def is_after?(timestamp, threshold) do
     if threshold == nil do
       true
     else
       timestamp
-      |> NaiveDateTime.from_iso8601!()
       # :second is the granularity used in the database
       |> NaiveDateTime.truncate(:second)
       |> NaiveDateTime.compare(threshold)
@@ -36,49 +28,6 @@ defmodule Backend.Crawler.Util do
     end
   end
-  def get(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  @spec get!(binary) :: %{
-          :__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
-          optional(:body) => any,
-          optional(:headers) => [any],
-          optional(:id) => reference,
-          optional(:request) => HTTPoison.Request.t(),
-          optional(:request_url) => any,
-          optional(:status_code) => integer
-        }
-  def get!(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  def post(url, body \\ "") do
-    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  def post!(url, body \\ "") do
-    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
   @spec urls_are_crawlable?([String.t()]) :: boolean()
   def urls_are_crawlable?(urls) do
     user_agent = get_config(:user_agent)
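
The regex-based get_domain/1 above also copes with plain-http profile URLs, which the old String.slice(url, 8..-1) approach (hard-coded to the length of "https://") would truncate. For example, with illustrative URLs:

# Illustrative only.
get_domain("https://mastodon.social/@demouser")
# => "mastodon.social"

get_domain("http://gnusocial.example/notice/123")
# => "gnusocial.example" (the old slice-based version would have returned "nusocial.example")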

View file

@@ -145,4 +145,54 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
+  @doc """
+  Gets and decodes a HTTP response.
+  """
+  @spec get_and_decode(String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def get_and_decode(url) do
+    case HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: body}} -> Jason.decode(body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec get_and_decode!(String.t()) :: any()
+  def get_and_decode!(url) do
+    case get_and_decode(url) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
+  @doc """
+  POSTS to a HTTP endpoint and decodes the JSON response.
+  """
+  @spec post_and_decode(String.t(), String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def post_and_decode(url, body \\ "") do
+    case HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: response_body}} -> Jason.decode(response_body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec post_and_decode!(String.t(), String.t()) :: any()
+  def post_and_decode!(url, body \\ "") do
+    case post_and_decode(url, body) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
 end
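
These helpers centralize the HTTPoison-plus-Jason plumbing the crawlers previously repeated inline. A rough usage sketch (the URL is a placeholder): the non-bang variants return tagged tuples, while the bang variants raise the underlying HTTPoison.Error or Jason.DecodeError, which is what Backend.Crawler's rescue clauses catch.

# Illustrative only.
case get_and_decode("https://example.com/api/v1/instance") do
  {:ok, instance} -> Map.get(instance, "title")
  {:error, %HTTPoison.Error{reason: reason}} -> {:error, reason}
  {:error, %Jason.DecodeError{} = err} -> {:error, err}
end

# Or let failures raise and be handled further up:
instance = get_and_decode!("https://example.com/api/v1/instance")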

View file

@@ -14,10 +14,7 @@ defmodule BackendWeb.AdminLoginController do
     # TODO: this assumes mastodon/pleroma API
     cleaned_domain = clean_domain(domain)
-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
     render(conn, "show.json", instance_data: instance_data, cleaned_domain: cleaned_domain)
   end
@@ -25,10 +22,7 @@ defmodule BackendWeb.AdminLoginController do
   def create(conn, %{"domain" => domain, "type" => type}) do
     cleaned_domain = clean_domain(domain)
-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
     error =
       cond do

View file

@@ -3,7 +3,7 @@ import { IconNames } from "@blueprintjs/icons";
 import React from "react";
 import { QUALITATIVE_COLOR_SCHEME } from "../../constants";
 import { typeColorScheme } from "../../types";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";
 interface IInstanceTypeProps {
   type: string;
@@ -15,7 +15,7 @@ interface IInstanceTypeProps {
  */
 const InstanceType: React.FC<IInstanceTypeProps> = ({ type, colorAfterName }) => {
   const idx = typeColorScheme.values.indexOf(type);
-  const name = " " + capitalize(type);
+  const name = " " + getTypeDisplayString(type);
   return (
     <>
       {!!colorAfterName && name}

View file

@@ -4,7 +4,7 @@ import React, { MouseEvent } from "react";
 import styled from "styled-components";
 import { INSTANCE_TYPES } from "../../constants";
 import { getSearchFilterDisplayValue, ISearchFilter } from "../../searchFilters";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";
 const SearchFilterContainer = styled.div`
   margin: 10px 0 0 0;
@@ -30,7 +30,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
   const handleSelectInstanceType = (e: MouseEvent<HTMLElement>) => {
     const field = "type";
     const relation = "eq";
-    const value = e.currentTarget.innerText.toLowerCase();
+    const value = e.currentTarget.innerText.toLowerCase().replace(" ", "");
     const filter: ISearchFilter = {
       displayValue: getSearchFilterDisplayValue(field, relation, value),
       field,
@@ -43,7 +43,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
     <Menu>
       <MenuItem icon={IconNames.SYMBOL_CIRCLE} text="Instance type" disabled={hasInstanceTypeFilter}>
         {INSTANCE_TYPES.map(t => (
-          <MenuItem key={t} text={capitalize(t)} onClick={handleSelectInstanceType} />
+          <MenuItem key={t} text={getTypeDisplayString(t)} onClick={handleSelectInstanceType} />
         ))}
       </MenuItem>
     </Menu>

View file

@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }
 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey", "gnusocial"];

View file

@@ -68,3 +68,13 @@ export const getBuckets = (min: number, max: number, steps: number, exponential:
     return range(min, max, bucketSize);
   }
 };
+const typeToDisplay = {
+  gnusocial: "GNU Social"
+};
+export const getTypeDisplayString = (key: string) => {
+  if (key in typeToDisplay) {
+    return typeToDisplay[key];
+  }
+  return capitalize(key);
+};
};