# Mirrored from https://github.com/fediversespace/fediverse.space
defmodule Backend.Crawler.Crawlers.GnuSocial do
  @moduledoc """
  Crawler for GNU Social servers.

  Detects GNU Social instances via nodeinfo (or, failing that, by probing the
  Twitter-compatible public timeline endpoint), then pages backwards through
  `/api/statuses/public_timeline.json` counting mentions per remote domain.
  """

  alias Backend.Crawler.ApiCrawler
  alias Backend.Crawler.Crawlers.Nodeinfo

  import Backend.Crawler.Util
  import Backend.Util

  require Logger

  @behaviour ApiCrawler

  # An instance is GNU Social if nodeinfo says so, or (for older servers that
  # lack nodeinfo) if the public timeline endpoint decodes to a JSON list.
  @impl ApiCrawler
  def is_instance_type?(domain, nodeinfo_result) do
    if nodeinfo_result != nil do
      Map.get(nodeinfo_result, :instance_type) == :gnusocial
    else
      case get_and_decode("https://#{domain}/api/statuses/public_timeline.json") do
        {:ok, statuses} -> is_list(statuses)
        {:error, _other} -> false
      end
    end
  end

  # Respects robots.txt: only crawl if the timeline endpoint is allowed.
  @impl ApiCrawler
  def allows_crawling?(domain) do
    [
      "/api/statuses/public_timeline.json"
    ]
    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
    |> urls_are_crawlable?()
  end

  @impl ApiCrawler
  def crawl(domain, nodeinfo_result) do
    # Crawl the timeline when we have no nodeinfo at all, or when the instance
    # is above the user threshold; otherwise just pass the nodeinfo through.
    if nodeinfo_result == nil or
         nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
      crawl_large_instance(domain, nodeinfo_result)
    else
      nodeinfo_result
    end
  end

  @spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
  defp crawl_large_instance(domain, nodeinfo_result) do
    status_datetime_threshold =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)

    # Don't get any statuses older than this (whichever is more recent: the
    # last crawl of this domain, or the configured age limit).
    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)

    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)

    if nodeinfo_result != nil do
      Map.merge(nodeinfo_result, %{
        interactions: interactions,
        statuses_seen: statuses_seen,
        peers: []
      })
    else
      # No nodeinfo: return a result map with only timeline-derived fields.
      %{
        version: nil,
        description: nil,
        user_count: nil,
        status_count: nil,
        peers: [],
        interactions: interactions,
        statuses_seen: statuses_seen,
        instance_type: :gnusocial
      }
    end
  end

  # Recursively pages backwards through the public timeline (via max_id),
  # accumulating a domain => mention-count map and a count of eligible
  # statuses seen so far.
  @spec get_interactions(
          String.t(),
          NaiveDateTime.t(),
          String.t() | nil,
          ApiCrawler.instance_interactions(),
          integer()
        ) :: {ApiCrawler.instance_interactions(), integer()}
  defp get_interactions(
         domain,
         min_timestamp,
         max_id \\ nil,
         interactions \\ %{},
         statuses_seen \\ 0
       ) do
    endpoint = "https://#{domain}/api/statuses/public_timeline.json"

    endpoint =
      if max_id != nil do
        endpoint <> "?max_id=#{max_id}"
      else
        endpoint
      end

    Logger.debug("Crawling #{endpoint}")

    statuses = get_and_decode!(endpoint)

    # Filter to statuses that are in the correct timeframe
    filtered_statuses =
      statuses
      |> Enum.filter(fn s ->
        s["created_at"]
        |> parse_timestamp()
        |> is_after?(min_timestamp)
      end)

    if length(filtered_statuses) > 0 do
      # Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
      eligible_statuses =
        filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)

      # get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
      interactions =
        eligible_statuses
        |> statuses_to_interactions()
        |> merge_count_maps(interactions)

      statuses_seen =
        eligible_statuses
        |> Kernel.length()
        |> Kernel.+(statuses_seen)

      oldest_status = Enum.at(filtered_statuses, -1)

      oldest_status_datetime =
        oldest_status
        |> Map.get("created_at")
        |> parse_timestamp()

      # Keep paging only while: the oldest status on this page is still newer
      # than the cutoff, we're under the status-count limit, and nothing on
      # this page was filtered out for age (i.e. we haven't hit the window edge).
      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
           statuses_seen < get_config(:status_count_limit) and
           length(filtered_statuses) == length(statuses) do
        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
      else
        {interactions, statuses_seen}
      end
    else
      {interactions, statuses_seen}
    end
  end

  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
  defp statuses_to_interactions(statuses) do
    statuses
    |> Enum.filter(fn status -> is_mention?(status) end)
    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
    |> Enum.reduce(%{}, fn map, acc ->
      # BUGFIX: sum per-domain counts when combining statuses. The previous
      # Map.merge/2 kept only the last-seen count for a domain, undercounting
      # interactions whenever multiple statuses mentioned the same domain.
      merge_count_maps(acc, map)
    end)
  end

  # Checks whether the status contains one or more mentions
  @spec is_mention?(any()) :: boolean()
  defp is_mention?(%{"attentions" => []}) do
    false
  end

  defp is_mention?(_status) do
    true
  end

  # GNU Social encodes the activity type in the status URI, e.g.
  # "tag:example.com,2019-01-01:fave:...": a "fave" segment marks a favorite.
  @spec is_fave?(any()) :: boolean()
  defp is_fave?(status) do
    uri_elements = status |> Map.get("uri") |> String.split(":")
    Enum.member?(uri_elements, "fave")
  end

  # True if the author's profile description opts out of bots via "nobot".
  @spec has_nobot?(any()) :: boolean()
  defp has_nobot?(status) do
    case get_in(status, ["user", "description"]) do
      nil ->
        false

      description ->
        description
        |> String.downcase()
        |> String.contains?("nobot")
    end
  end

  # Counts mentions per remote domain from a single status's "attentions" list.
  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
  defp extract_mentions_from_status(status) do
    status["attentions"]
    |> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
    |> Enum.reduce(%{}, fn domain, acc ->
      Map.update(acc, domain, 1, &(&1 + 1))
    end)
  end

  # Parses the messed-up time format that GNU social uses
  # Like seriously, it's 2019, why *wouldn't* you use iso8601?
  @spec parse_timestamp(String.t()) :: NaiveDateTime.t()
  defp parse_timestamp(timestamp) do
    timestamp
    |> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
    |> Timex.to_naive_datetime()
  end
end
|