fediverse.space/backend/lib/backend/crawler/crawlers/mastodon.ex

242 wiersze
7.1 KiB
Elixir

defmodule Backend.Crawler.Crawlers.Mastodon do
@moduledoc """
Crawler for the Mastodon API (used by Mastodon, its forks like Gab or Glitch, and Pleroma).
"""
require Logger
import Backend.Crawler.Util
import Backend.Util
alias Backend.Crawler.ApiCrawler
@behaviour ApiCrawler
@impl ApiCrawler
# Decides whether `domain` should be handled by this crawler.
#
# With no prior result, the instance endpoint is probed and the domain is
# accepted when the decoded response is a map containing a "title" key.
def is_instance_type?(domain, nil) do
  match?({:ok, %{"title" => _}}, get_and_decode("https://#{domain}/api/v1/instance"))
end

# With a prior result (e.g. from nodeinfo) the instance type may already be
# known; only the types served by the Mastodon API are accepted.
def is_instance_type?(_domain, result) do
  Map.get(result, :instance_type) in [:pleroma, :smithereen, :mastodon]
end
@impl ApiCrawler
# Builds the full URL of every endpoint this crawler requests and hands the
# list to `urls_are_crawlable?/1`, whose verdict is returned unchanged.
def allows_crawling?(domain) do
  endpoints = [
    "/api/v1/instance",
    "/api/v1/instance/peers",
    "/api/v1/timelines/public"
  ]

  urls = for endpoint <- endpoints, do: "https://#{domain}#{endpoint}"

  urls_are_crawlable?(urls)
end
@impl ApiCrawler
# Crawls `domain` and returns the result merged on top of `nodeinfo`.
#
# Instances above the user threshold (or that have opted in) get a full crawl
# via `crawl_large_instance/2`. All others only get the crawler defaults plus
# their instance type and user count.
def crawl(domain, nodeinfo) do
  instance = get_and_decode!("https://#{domain}/api/v1/instance")
  user_count = get_in(instance, ["stats", "user_count"])

  if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
    large_instance_data = crawl_large_instance(domain, instance)
    Map.merge(nodeinfo, large_instance_data)
  else
    defaults = ApiCrawler.get_default()
    base = Map.merge(defaults, nodeinfo)

    Map.merge(base, %{
      instance_type: get_instance_type(instance),
      user_count: user_count
    })
  end
end
@spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
# Performs the full crawl: fetches peers and interactions, then combines them
# with the version/description and user/status counts taken from the already
# decoded `instance` response.
defp crawl_large_instance(domain, instance) do
  peers = get_peers(domain)
  Logger.debug("Found #{length(peers)} peers.")

  {interactions, statuses_seen} = get_interactions(domain)

  Logger.debug(
    "#{domain}: found #{interactions |> Map.values() |> Enum.sum()} mentions in #{statuses_seen} statuses."
  )

  instance
  |> Map.take(["version", "description"])
  |> Map.merge(Map.take(instance["stats"], ["user_count", "status_count"]))
  # The API response uses string keys; the crawl result uses atom keys.
  |> convert_keys_to_atoms()
  |> Map.merge(%{
    peers: peers,
    interactions: interactions,
    statuses_seen: statuses_seen,
    instance_type: get_instance_type(instance)
  })
end
@spec get_interactions(
        String.t(),
        String.t() | nil,
        Calendar.naive_datetime() | nil,
        ApiCrawler.instance_interactions(),
        integer
      ) :: {ApiCrawler.instance_interactions(), integer}
# Pages backwards through the local public timeline of `domain`, accumulating
# mention counts per domain and the number of statuses examined.
#
# Returns `{interactions, statuses_seen}`. Paging continues only while every
# status on the page passed the timestamp filter, the oldest one is newer than
# the configured age limit, and the configured status count limit has not
# been reached.
defp get_interactions(
       domain,
       max_id \\ nil,
       min_timestamp \\ nil,
       interactions \\ %{},
       statuses_seen \\ 0
     ) do
  # A counter of zero marks the initial call, so the timestamp of the last
  # crawl is looked up; recursive calls reuse the value passed in.
  min_timestamp =
    if statuses_seen == 0, do: get_last_crawl_timestamp(domain), else: min_timestamp

  base_url = "https://#{domain}/api/v1/timelines/public?local=true"
  endpoint = if max_id, do: base_url <> "&max_id=#{max_id}", else: base_url

  Logger.debug("Crawling #{endpoint}")
  statuses = get_and_decode!(endpoint)

  recent_statuses =
    Enum.filter(statuses, fn status ->
      created_at = NaiveDateTime.from_iso8601!(status["created_at"])
      is_after?(created_at, min_timestamp)
    end)

  case recent_statuses do
    [] ->
      {interactions, statuses_seen}

    _ ->
      # Only statuses that have mentions and whose author has not opted out
      # via "nobot" contribute to the interaction counts.
      interactions =
        merge_count_maps(statuses_to_interactions(recent_statuses), interactions)

      statuses_seen = statuses_seen + length(recent_statuses)

      now = NaiveDateTime.utc_now()
      age_limit_seconds = get_config(:status_age_limit_days) * 24 * 3600 * -1
      oldest_allowed = NaiveDateTime.add(now, age_limit_seconds, :second)

      # The last status of the page is treated as the oldest one; its id
      # becomes `max_id` for the next page.
      oldest_status = List.last(recent_statuses)
      oldest_created_at = NaiveDateTime.from_iso8601!(oldest_status["created_at"])

      keep_paging? =
        NaiveDateTime.compare(oldest_created_at, oldest_allowed) == :gt and
          statuses_seen < get_config(:status_count_limit) and
          length(recent_statuses) == length(statuses)

      if keep_paging? do
        get_interactions(domain, oldest_status["id"], min_timestamp, interactions, statuses_seen)
      else
        {interactions, statuses_seen}
      end
  end
end
# Fetches the peers published by `domain`. Servers may not publish peers, in
# which case the request fails and an empty list is returned instead.
defp get_peers(domain) do
  peers_url = "https://#{domain}/api/v1/instance/peers"

  case get_and_decode(peers_url) do
    {:error, _reason} -> []
    {:ok, peer_list} -> peer_list
  end
end
# Returns true when the status carries at least one mention, i.e. its
# "mentions" entry is neither missing/null nor an empty list.
defp is_mention?(status) do
  status["mentions"] not in [nil, []]
end
# Checks if the author of the status has "nobot" in their profile.
#
# The profile text is the account's "note" followed by the name and value of
# every entry in "fields", compared case-insensitively. A missing or null
# note, fields list, or field name/value is treated as empty text rather than
# raising, so accounts with incomplete profile data are simply not flagged.
defp has_nobot?(status) do
  account = status["account"]

  # `nil` interpolates to "", which covers absent names and values.
  field_text =
    (account["fields"] || [])
    |> Enum.map_join("", fn field -> "#{field["name"]}#{field["value"]}" end)

  # this also means that any users who mention ethnobotany in their profiles will be excluded lol ¯\_(ツ)_/¯
  "#{account["note"]}#{field_text}"
  |> String.downcase()
  |> String.contains?("nobot")
end
# A status is eligible when it
# a) contains one or more mentions, and
# b) was posted by someone who doesn't have "nobot" in their profile.
# The profile is only inspected for statuses that do have mentions.
defp is_eligible?(status) do
  if is_mention?(status), do: not has_nobot?(status), else: false
end
@spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
# Counts the mentions of a single status per domain: each mention's "url" is
# reduced to its domain with `get_domain/1` and tallied in a map.
defp extract_mentions_from_status(status) do
  Enum.reduce(status["mentions"], %{}, fn mention, counts ->
    Map.update(counts, get_domain(mention["url"]), 1, fn seen -> seen + 1 end)
  end)
end
@spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
# Turns a list of statuses into a single map of domain => mention count.
#
# Only eligible statuses (see `is_eligible?/1`) contribute. The per-status
# count maps are combined by adding the counts of domains that occur in more
# than one status; a plain `Map.merge/2` would keep only the last status's
# count for such a domain and undercount mentions.
defp statuses_to_interactions(statuses) do
  statuses
  |> Enum.filter(&is_eligible?/1)
  |> Enum.map(&extract_mentions_from_status/1)
  |> Enum.reduce(%{}, fn status_counts, acc ->
    Map.merge(acc, status_counts, fn _domain, total, count -> total + count end)
  end)
end
# Heuristically decides whether the instance response comes from Gab.
#
# It is considered Gab when its title is "gab social" (case-insensitive) or
# when its contact account carries any of the Gab-specific keys. A missing or
# non-string title and a missing or non-map contact account count as "no
# match" instead of raising.
defp is_gab?(instance) do
  title = Map.get(instance, "title")
  title_is_gab = is_binary(title) and String.downcase(title) == "gab social"

  contact_account = Map.get(instance, "contact_account")
  gab_keys = ["is_pro", "is_verified", "is_donor", "is_investor"]

  has_gab_keys =
    is_map(contact_account) and Enum.any?(gab_keys, &Map.has_key?(contact_account, &1))

  title_is_gab or has_gab_keys
end
# Derives the instance type from the instance response: the lower-cased
# "version" string identifies Pleroma and Smithereen, `is_gab?/1` identifies
# Gab, and everything else is treated as Mastodon.
defp get_instance_type(instance_stats) do
  version = instance_stats |> Map.get("version") |> String.downcase()

  cond do
    String.contains?(version, "pleroma") -> :pleroma
    String.contains?(version, "smithereen") -> :smithereen
    is_gab?(instance_stats) -> :gab
    true -> :mastodon
  end
end
end