fediverse.space/backend/lib/backend/crawler/crawlers/nodeinfo.ex

defmodule Backend.Crawler.Crawlers.Nodeinfo do
  @moduledoc """
  This module is slightly different from the other crawlers. It's run before all the others and its
  result is included in theirs.
  """
  alias Backend.Crawler.ApiCrawler
  require Logger
  import Backend.Util
  import Backend.Crawler.Util

  @behaviour ApiCrawler
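
  # urls_are_crawlable?/1 is imported from Backend.Crawler.Util. Judging by its name it checks
  # whether the target instance allows these URLs to be fetched (e.g. via robots.txt), but that
  # logic lives in Backend.Crawler.Util, not in this module.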

  @impl ApiCrawler
  def allows_crawling?(domain) do
    [
      ".well-known/nodeinfo"
    ]
    |> Enum.map(fn endpoint -> "https://#{domain}/#{endpoint}" end)
    |> urls_are_crawlable?()
  end

  @impl ApiCrawler
  def is_instance_type?(_domain, _nodeinfo) do
    # This crawler is used slightly differently from the others -- we always check for nodeinfo.
    true
  end
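
  # The crawl happens in two steps: first we fetch the discovery document at
  # /.well-known/nodeinfo to find the URL of a compatible nodeinfo schema, then we fetch and
  # normalise that document. If either step fails we fall back to ApiCrawler.get_default().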

  @impl ApiCrawler
  def crawl(domain, _curr_result) do
    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
      nodeinfo
    else
      _other -> ApiCrawler.get_default()
    end
  end

  @spec get_nodeinfo_url(String.t()) ::
          {:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
  defp get_nodeinfo_url(domain) do
    case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
      {:ok, response} -> {:ok, process_nodeinfo_url(response)}
      {:error, err} -> {:error, err}
    end
  end
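
  # The .well-known/nodeinfo discovery document lists the schema versions an instance supports.
  # An illustrative example (the href is hypothetical):
  #
  #   {
  #     "links": [
  #       {
  #         "rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
  #         "href": "https://example.com/nodeinfo/2.0"
  #       }
  #     ]
  #   }
  #
  # We pick the first link whose schema version we understand and return its href.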

  @spec process_nodeinfo_url(any()) :: String.t()
  defp process_nodeinfo_url(response) do
    response
    |> Map.get("links")
    |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
    |> Kernel.hd()
    |> Map.get("href")
  end

  @spec get_nodeinfo(String.t()) ::
          {:ok, ApiCrawler.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
  defp get_nodeinfo(nodeinfo_url) do
    case get_and_decode(nodeinfo_url) do
      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
      {:error, err} -> {:error, err}
    end
  end
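
  # A nodeinfo 2.0 document roughly looks like the following (values are illustrative):
  #
  #   {
  #     "version": "2.0",
  #     "software": {"name": "pleroma", "version": "2.5.0"},
  #     "usage": {"users": {"total": 120}, "localPosts": 4321},
  #     "metadata": {"nodeDescription": "a small instance", ...}
  #   }
  #
  # Instances below the user threshold only contribute their user count; everything else is
  # dropped, presumably to avoid storing details about small personal servers.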

  @spec process_nodeinfo(any()) :: ApiCrawler.t()
  defp process_nodeinfo(nodeinfo) do
    user_count = get_in(nodeinfo, ["usage", "users", "total"])

    if is_above_user_threshold?(user_count) do
      # Both of these are used, depending on the server implementation
      description =
        [
          get_in(nodeinfo, ["metadata", "description"]),
          get_in(nodeinfo, ["metadata", "nodeDescription"])
        ]
        |> Enum.filter(fn d -> d != nil end)
        |> Enum.at(0)

      type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()

      Map.merge(
        ApiCrawler.get_default(),
        %{
          description: description,
          user_count: user_count,
          status_count: get_in(nodeinfo, ["usage", "localPosts"]),
          instance_type: type,
          version: get_in(nodeinfo, ["software", "version"]),
          federation_restrictions: get_federation_restrictions(nodeinfo)
        }
      )
    else
      Map.merge(
        ApiCrawler.get_default(),
        %{
          user_count: user_count
        }
      )
    end
  end
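
  # Nodeinfo schema URLs end in their version number (e.g. ".../ns/schema/2.0"), so the last
  # three characters of the rel URL are enough to identify the version.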

  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
  defp is_compatible_nodeinfo_version?(schema_url) do
    version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
    Enum.member?(["1.0", "1.1", "2.0"], version)
  end
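
  # Pleroma (and its forks) can expose their simple MRF policies and quarantined instances under
  # metadata.federation in nodeinfo. A hypothetical example of the shape this function handles:
  #
  #   "federation" => %{
  #     "mrf_simple" => %{
  #       "reject" => ["spam.example"],
  #       "media_nsfw" => ["nsfw.example"]
  #     },
  #     "quarantined_instances" => ["sketchy.example"]
  #   }
  #
  # Each entry is flattened into a {domain, restriction_type} tuple.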

  @spec get_federation_restrictions(any()) :: [ApiCrawler.federation_restriction()]
  defp get_federation_restrictions(nodeinfo) do
    mrf_simple = get_in(nodeinfo, ["metadata", "federation", "mrf_simple"])
    quarantined_domains = get_in(nodeinfo, ["metadata", "federation", "quarantined_instances"])

    quarantined_domains =
      if quarantined_domains == nil do
        []
      else
        Enum.map(quarantined_domains, fn domain -> {domain, "quarantine"} end)
      end

    if mrf_simple != nil do
      mrf_simple
      |> Map.take([
        "report_removal",
        "reject",
        "media_removal",
        "media_nsfw",
        "federated_timeline_removal",
        "banner_removal",
        "avatar_removal",
        "accept"
      ])
      |> Enum.flat_map(fn {type, domains} ->
        # credo:disable-for-next-line Credo.Check.Refactor.Nesting
        Enum.map(domains, fn domain -> {domain, type} end)
      end)
      |> Enum.concat(quarantined_domains)
    else
      quarantined_domains
    end
  end
end