fediverse.space/backend/lib/backend/crawler/crawlers/nodeinfo.ex

defmodule Backend.Crawler.Crawlers.Nodeinfo do
  @moduledoc """
  This module is slightly different from the other crawlers. It's run before all the others and its
  result is included in theirs.
  """

  alias Backend.Crawler.ApiCrawler
  alias Backend.Http

  require Logger
  import Backend.Util
  import Backend.Crawler.Util

  @behaviour ApiCrawler
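
  # Checks whether we're allowed to crawl this instance at all.
  # urls_are_crawlable?/1 is imported from Backend.Crawler.Util and, judging by
  # its name, consults the instance's robots.txt for the given URLs (an
  # assumption; the helper's implementation lives outside this module).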
  @impl ApiCrawler
  def allows_crawling?(domain) do
    [
      "/.well-known/nodeinfo"
    ]
    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
    |> urls_are_crawlable?()
  end

  @impl ApiCrawler
  def is_instance_type?(_domain, _nodeinfo) do
    # This crawler is used slightly differently from the others -- we always check for nodeinfo.
    true
  end
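
  # The crawl itself is a two-step fetch: resolve the nodeinfo document's URL
  # from the well-known discovery endpoint, then fetch and normalize the
  # document. Any failure along the way falls back to ApiCrawler.get_default(),
  # i.e. an empty result.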
  @impl ApiCrawler
  def crawl(domain, _curr_result) do
    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
      nodeinfo
    else
      _other -> ApiCrawler.get_default()
    end
  end

  @spec get_nodeinfo_url(String.t()) ::
          {:ok, String.t()} | {:error, Jason.DecodeError.t() | Http.Error.t() | :invalid_body}
  defp get_nodeinfo_url(domain) do
    with {:ok, response} <-
           http_client().get_and_decode("https://#{domain}/.well-known/nodeinfo"),
         {:ok, nodeinfo_url} <- process_nodeinfo_url(response) do
      {:ok, nodeinfo_url}
    else
      {:error, error} -> {:error, error}
      :error -> {:error, :invalid_body}
    end
  end
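
  # For reference, a decoded /.well-known/nodeinfo discovery document generally
  # looks like the sketch below (values are illustrative, not from a real
  # server); process_nodeinfo_url/1 picks the href of the first link whose
  # schema version we support:
  #
  #   %{
  #     "links" => [
  #       %{
  #         "rel" => "http://nodeinfo.diaspora.software/ns/schema/2.0",
  #         "href" => "https://example.com/nodeinfo/2.0"
  #       }
  #     ]
  #   }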
  @spec process_nodeinfo_url(any()) :: {:ok, String.t()} | :error
  defp process_nodeinfo_url(response) do
    links =
      response
      |> Map.get("links", [])
      |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)

    if Enum.empty?(links) do
      :error
    else
      href =
        links
        |> Kernel.hd()
        |> Map.get("href")

      {:ok, href}
    end
  end

  @spec get_nodeinfo(String.t()) ::
          {:ok, ApiCrawler.t()} | {:error, Jason.DecodeError.t() | Http.Error.t()}
  defp get_nodeinfo(nodeinfo_url) do
    case http_client().get_and_decode(nodeinfo_url) do
      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
      {:error, err} -> {:error, err}
    end
  end
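
  # A trimmed NodeInfo 2.0 document with the fields consumed below (a sketch;
  # values are illustrative):
  #
  #   %{
  #     "software" => %{"name" => "pleroma", "version" => "2.5.0"},
  #     "usage" => %{"users" => %{"total" => 1200}, "localPosts" => 340_000},
  #     "metadata" => %{"nodeDescription" => "An example instance"}
  #   }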
  @spec process_nodeinfo(any()) :: ApiCrawler.t()
  defp process_nodeinfo(nodeinfo) do
    user_count = get_in(nodeinfo, ["usage", "users", "total"])

    if is_above_user_threshold?(user_count) do
      # Which of these fields is used depends on the server implementation
      description =
        [
          get_in(nodeinfo, ["metadata", "description"]),
          get_in(nodeinfo, ["metadata", "nodeDescription"]),
          # pixelfed
          get_in(nodeinfo, ["metadata", "config", "site", "description"])
        ]
        |> Enum.filter(fn d -> d != nil end)
        |> Enum.at(0)

      type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()

      Map.merge(
        ApiCrawler.get_default(),
        %{
          description: description,
          user_count: handle_count(user_count),
          status_count: nodeinfo |> get_in(["usage", "localPosts"]) |> handle_count(),
          instance_type: type,
          version: get_in(nodeinfo, ["software", "version"]),
          federation_restrictions: get_federation_restrictions(nodeinfo)
        }
      )
    else
      Map.merge(
        ApiCrawler.get_default(),
        %{
          user_count: user_count
        }
      )
    end
  end
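
  # The rel attribute in the discovery document is the schema URL, so the last
  # three characters are the schema version, e.g.:
  #
  #   String.slice("http://nodeinfo.diaspora.software/ns/schema/2.0", -3, 3)
  #   # => "2.0"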
  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
  defp is_compatible_nodeinfo_version?(schema_url) do
    version = String.slice(schema_url, -3, 3)
    Enum.member?(["1.0", "1.1", "2.0"], version)
  end
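
  # Pleroma (and its forks) publish federation restrictions in nodeinfo
  # metadata roughly like the sketch below (domains are illustrative); each
  # entry is flattened into a {domain, restriction_type} tuple:
  #
  #   %{
  #     "metadata" => %{
  #       "federation" => %{
  #         "mrf_simple" => %{
  #           "reject" => ["spam.example"],
  #           "media_nsfw" => ["lewd.example"]
  #         },
  #         "quarantined_instances" => ["quarantined.example"]
  #       }
  #     }
  #   }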
  @spec get_federation_restrictions(any()) :: [ApiCrawler.federation_restriction()]
  defp get_federation_restrictions(nodeinfo) do
    mrf_simple = get_in(nodeinfo, ["metadata", "federation", "mrf_simple"])
    quarantined_domains = get_in(nodeinfo, ["metadata", "federation", "quarantined_instances"])

    quarantined_domains =
      if quarantined_domains == nil do
        []
      else
        Enum.map(quarantined_domains, fn domain -> {domain, "quarantine"} end)
      end

    if mrf_simple != nil do
      mrf_simple
      |> Map.take([
        "report_removal",
        "reject",
        "media_removal",
        "media_nsfw",
        "federated_timeline_removal",
        "banner_removal",
        "avatar_removal",
        "accept"
      ])
      |> Enum.flat_map(fn {type, domains} ->
        # credo:disable-for-next-line Credo.Check.Refactor.Nesting
        Enum.map(domains, fn domain -> {domain, type} end)
      end)
      |> Enum.concat(quarantined_domains)
    else
      quarantined_domains
    end
  end

  # Handle a count that may be formatted as a string or an integer.
  defp handle_count(count) do
    if is_integer(count) do
      count
    else
      {count, _rem} = Integer.parse(count)
      count
    end
  end
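
  # For example: handle_count(42) and handle_count("42") both return 42.
  # Note that Integer.parse/1 stops at the first non-digit, so a string like
  # "1.5k" would come back as 1.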
end