Revert "add metadata endpoint"

This reverts commit 82153b283b3be8e7a48da92a6d02d05ef28e98c5.
pull/1/head
Tao Bojlén 2019-08-09 16:59:51 +00:00
parent 9a0bbbb7d9
commit 3320e050c8
14 changed files with 477 additions and 170 deletions

View file

@@ -11,10 +11,12 @@ defmodule Backend.Crawler.ApiCrawler do
   * Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
   """
+  alias Backend.Crawler.Crawlers.Nodeinfo
   # {domain_mentioned, count}
   @type instance_interactions :: %{String.t() => integer}
-  @type instance_type :: :mastodon | :pleroma | :gab | :misskey
+  @type instance_type :: :mastodon | :pleroma | :gab | :misskey | :gnusocial
   defstruct [
     :version,
@@ -30,8 +32,8 @@ defmodule Backend.Crawler.ApiCrawler do
   @type t() :: %__MODULE__{
           version: String.t(),
           description: String.t(),
-          user_count: integer,
-          status_count: integer,
+          user_count: integer | nil,
+          status_count: integer | nil,
           peers: [String.t()],
           interactions: instance_interactions,
           statuses_seen: integer,
@@ -40,8 +42,9 @@ defmodule Backend.Crawler.ApiCrawler do
   @doc """
   Check whether the instance at the given domain is of the type that this ApiCrawler implements.
+  Arguments are the instance domain and the nodeinfo results.
   """
-  @callback is_instance_type?(String.t()) :: boolean()
+  @callback is_instance_type?(String.t(), Nodeinfo.t()) :: boolean()
   @doc """
   Check whether the instance allows crawling according to its robots.txt or otherwise.
@@ -50,6 +53,7 @@ defmodule Backend.Crawler.ApiCrawler do
   @doc """
   Crawl the instance at the given domain.
+  Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
   """
-  @callback crawl(String.t()) :: t()
+  @callback crawl(String.t(), Nodeinfo.t()) :: t()
 end
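
For orientation, a minimal sketch of a crawler module written against the updated two-argument callbacks. The module name, the :example instance type, and the final merge are illustrative only (modelled on the GnuSocial crawler added below), not part of this commit.

# Illustrative example, not part of the commit.
defmodule Backend.Crawler.Crawlers.ExampleCrawler do
  alias Backend.Crawler.ApiCrawler

  @behaviour ApiCrawler

  # The nodeinfo result (or nil) is now passed alongside the domain.
  @impl ApiCrawler
  def is_instance_type?(_domain, nodeinfo_result) do
    nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :example
  end

  @impl ApiCrawler
  def allows_crawling?(_domain), do: true

  @impl ApiCrawler
  def crawl(_domain, nodeinfo_result) do
    # Reuse the counts found by the Nodeinfo pre-crawl and add the remaining fields.
    Map.merge(nodeinfo_result, %{peers: [], interactions: %{}, statuses_seen: 0})
  end
end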

View file

@@ -4,7 +4,7 @@ defmodule Backend.Crawler do
   """
   alias __MODULE__
-  alias Backend.Crawler.Crawlers.{Mastodon, Misskey}
+  alias Backend.Crawler.Crawlers.{GnuSocial, Mastodon, Misskey, Nodeinfo}
   alias Backend.Crawler.ApiCrawler
   alias Backend.{Crawl, CrawlInteraction, Repo, Instance, InstancePeer}
   import Ecto.Query
@@ -16,8 +16,8 @@ defmodule Backend.Crawler do
     :domain,
     # a list of ApiCrawlers that will be attempted
     :api_crawlers,
-    :found_api?,
     :allows_crawling?,
+    :found_api?,
     :result,
     :error
   ]
@@ -25,8 +25,8 @@ defmodule Backend.Crawler do
   @type t() :: %__MODULE__{
           domain: String.t(),
           api_crawlers: [ApiCrawler.t()],
-          found_api?: boolean,
           allows_crawling?: boolean,
+          found_api?: boolean,
           result: ApiCrawler.t() | nil,
           error: String.t() | nil
         }
@@ -37,16 +37,18 @@ defmodule Backend.Crawler do
     state = %Crawler{
       domain: domain,
       api_crawlers: [],
-      found_api?: false,
       allows_crawling?: true,
+      found_api?: false,
       result: nil,
       error: nil
     }
     state
-    # register APICrawlers here
+    # These crawlers are run in the order they're registered. Nodeinfo should be the first one.
+    |> register(Nodeinfo)
     |> register(Mastodon)
     |> register(Misskey)
+    |> register(GnuSocial)
     # go!
     |> crawl()
     |> save()
@@ -56,33 +58,47 @@ defmodule Backend.Crawler do
   # Adds a new ApiCrawler that run/1 will check.
   defp register(%Crawler{api_crawlers: crawlers} = state, api_crawler) do
-    Map.put(state, :api_crawlers, [api_crawler | crawlers])
+    Map.put(state, :api_crawlers, crawlers ++ [api_crawler])
   end
   # Recursive function to check whether `domain` has an API that the head of the api_crawlers list can read.
   # If so, crawls it. If not, continues with the tail of the api_crawlers list.
   defp crawl(%Crawler{api_crawlers: [], domain: domain} = state) do
     Logger.debug("Found no compatible API for #{domain}")
-    Map.put(state, :found_api?, false)
+    state
   end
-  defp crawl(%Crawler{domain: domain, api_crawlers: [curr | remaining_crawlers]} = state) do
-    if curr.is_instance_type?(domain) do
+  # Nodeinfo is distinct from other crawlers in that
+  # a) it should always be run first
+  # b) it passes the results on to the next crawlers (e.g. user_count)
+  defp crawl(%Crawler{api_crawlers: [Nodeinfo | remaining_crawlers], domain: domain} = state) do
+    with true <- Nodeinfo.allows_crawling?(domain), {:ok, nodeinfo} <- Nodeinfo.crawl(domain) do
+      Logger.debug("Found nodeinfo for #{domain}.")
+      result = Map.merge(nodeinfo, %{peers: [], interactions: %{}, statuses_seen: 0})
+      crawl(%Crawler{state | result: result, found_api?: true, api_crawlers: remaining_crawlers})
+    else
+      _ ->
+        Logger.debug("Did not find nodeinfo for #{domain}.")
+        crawl(%Crawler{state | api_crawlers: remaining_crawlers})
+    end
+  end
+  defp crawl(
+         %Crawler{domain: domain, result: result, api_crawlers: [curr | remaining_crawlers]} =
+           state
+       ) do
+    if curr.is_instance_type?(domain, result) do
       Logger.debug("Found #{curr} instance")
-      state = Map.put(state, :found_api?, true)
       if curr.allows_crawling?(domain) do
         try do
-          %Crawler{state | result: curr.crawl(domain), api_crawlers: []}
+          %Crawler{state | result: curr.crawl(domain, result), found_api?: true}
         rescue
           e in HTTPoison.Error ->
             Map.put(state, :error, "HTTPoison error: " <> HTTPoison.Error.message(e))
           e in Jason.DecodeError ->
             Map.put(state, :error, "Jason DecodeError: " <> Jason.DecodeError.message(e))
+          e in _ ->
+            Map.put(state, :error, "Unknown error: " <> inspect(e))
         end
       else
         Logger.debug("#{domain} does not allow crawling.")
@@ -99,9 +115,9 @@ defmodule Backend.Crawler do
   defp save(%Crawler{
          domain: domain,
          result: result,
-         found_api?: true,
         error: nil,
-         allows_crawling?: true
+         allows_crawling?: true,
+         found_api?: true
       }) do
     now = get_now()
@@ -240,7 +256,7 @@ defmodule Backend.Crawler do
       cond do
         not allows_crawling -> "robots.txt"
         error == nil -> "no api found"
-        true -> "unknown error"
+        true -> error
       end
     # The "+1" is this error!
@@ -250,25 +266,25 @@ defmodule Backend.Crawler do
       |> Map.get(:crawl_error_count)
      |> Kernel.+(1)
-    # The crawl interval grows exponentially at first but never goes above 72 hours
+    # The crawl interval grows exponentially at first but never goes above 24 hours
     crawl_interval_mins =
-      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 4320)
+      min(get_config(:crawl_interval_mins) * round(:math.pow(2, error_count)), 1440)
     next_crawl = NaiveDateTime.add(now, crawl_interval_mins * 60, :second)
-    Repo.transaction(fn ->
-      Repo.insert!(
-        %Instance{
-          domain: domain,
-          base_domain: get_base_domain(domain),
-          crawl_error: error,
-          crawl_error_count: error_count,
-          next_crawl: next_crawl
-        },
-        on_conflict: {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl]},
-        conflict_target: :domain
-      )
-    end)
+    Repo.insert!(
+      %Instance{
+        domain: domain,
+        base_domain: get_base_domain(domain),
+        crawl_error: error,
+        crawl_error_count: error_count,
+        next_crawl: next_crawl,
+        updated_at: now
+      },
+      on_conflict:
+        {:replace, [:base_domain, :crawl_error, :crawl_error_count, :next_crawl, :updated_at]},
+      conflict_target: :domain
+    )
     Appsignal.increment_counter("crawler.failure", 1)
   end
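
To make the retry change above concrete: with a base interval of 30 minutes (an illustrative value; the real one comes from get_config(:crawl_interval_mins)), the backoff doubles per consecutive error and now caps at 1440 minutes (24 hours) instead of 4320 (72 hours).

# Illustrative only; 30 is a made-up base interval.
base_interval_mins = 30

Enum.map(1..8, fn error_count ->
  min(base_interval_mins * round(:math.pow(2, error_count)), 1440)
end)
# => [60, 120, 240, 480, 960, 1440, 1440, 1440]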

View file

@@ -0,0 +1,178 @@
+defmodule Backend.Crawler.Crawlers.GnuSocial do
+  alias Backend.Crawler.ApiCrawler
+  alias Backend.Crawler.Crawlers.Nodeinfo
+  import Backend.Crawler.Util
+  import Backend.Util
+  require Logger
+  @behaviour ApiCrawler
+  @impl ApiCrawler
+  def is_instance_type?(_domain, nodeinfo_result) do
+    nodeinfo_result != nil and Map.get(nodeinfo_result, :instance_type) == :gnusocial
+  end
+  @impl ApiCrawler
+  def allows_crawling?(domain) do
+    [
+      "/api/statuses/public_timeline.json"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+  @impl ApiCrawler
+  def crawl(domain, nodeinfo_result) do
+    if nodeinfo_result |> Map.get(:user_count) |> is_above_user_threshold?() do
+      crawl_large_instance(domain, nodeinfo_result)
+    else
+      nodeinfo_result
+    end
+  end
+  @spec crawl_large_instance(String.t(), Nodeinfo.t()) :: ApiCrawler.t()
+  defp crawl_large_instance(domain, nodeinfo_result) do
+    status_datetime_threshold =
+      NaiveDateTime.utc_now()
+      |> NaiveDateTime.add(get_config(:status_age_limit_days) * 24 * 3600 * -1, :second)
+    # Don't get any statuses older than this
+    min_timestamp = max_datetime(get_last_crawl_timestamp(domain), status_datetime_threshold)
+    {interactions, statuses_seen} = get_interactions(domain, min_timestamp)
+    Map.merge(nodeinfo_result, %{
+      interactions: interactions,
+      statuses_seen: statuses_seen,
+      peers: []
+    })
+  end
+  @spec get_interactions(
+          String.t(),
+          NaiveDateTime.t(),
+          String.t() | nil,
+          ApiCrawler.instance_interactions(),
+          integer()
+        ) :: {ApiCrawler.instance_interactions(), integer()}
+  defp get_interactions(
+         domain,
+         min_timestamp,
+         max_id \\ nil,
+         interactions \\ %{},
+         statuses_seen \\ 0
+       ) do
+    endpoint = "https://#{domain}/api/statuses/public_timeline.json"
+    endpoint =
+      if max_id != nil do
+        endpoint <> "?max_id=#{max_id}"
+      else
+        endpoint
+      end
+    Logger.debug("Crawling #{endpoint}")
+    statuses = get_and_decode!(endpoint)
+    # Filter to statuses that are in the correct timeframe
+    filtered_statuses =
+      statuses
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> parse_timestamp()
+        |> is_after?(min_timestamp)
+      end)
+    if length(filtered_statuses) > 0 do
+      # Filter down further to statuses that a) aren't faves and b) aren't from #nobot users
+      eligible_statuses =
+        filtered_statuses |> Enum.filter(fn s -> not is_fave?(s) and not has_nobot?(s) end)
+      # get statuses that are eligible (i.e. users don't have #nobot in their profile), have mentions, and are not faves
+      interactions =
+        eligible_statuses
+        |> statuses_to_interactions()
+        |> merge_count_maps(interactions)
+      statuses_seen =
+        eligible_statuses
+        |> Kernel.length()
+        |> Kernel.+(statuses_seen)
+      oldest_status = Enum.at(filtered_statuses, -1)
+      oldest_status_datetime =
+        oldest_status
+        |> Map.get("created_at")
+        |> parse_timestamp()
+      if NaiveDateTime.compare(oldest_status_datetime, min_timestamp) == :gt and
+           statuses_seen < get_config(:status_count_limit) and
+           length(filtered_statuses) == length(statuses) do
+        get_interactions(domain, min_timestamp, oldest_status["id"], interactions, statuses_seen)
+      else
+        {interactions, statuses_seen}
+      end
+    else
+      {interactions, statuses_seen}
+    end
+  end
+  @spec statuses_to_interactions(any()) :: ApiCrawler.instance_interactions()
+  defp statuses_to_interactions(statuses) do
+    statuses
+    |> Enum.filter(fn status -> is_mention?(status) end)
+    |> Enum.map(fn status -> extract_mentions_from_status(status) end)
+    |> Enum.reduce(%{}, fn map, acc ->
+      Map.merge(acc, map)
+    end)
+  end
+  # Checks whether the status contains one or more mentions
+  @spec is_mention?(any()) :: boolean()
+  defp is_mention?(%{"attentions" => []}) do
+    false
+  end
+  defp is_mention?(_status) do
+    true
+  end
+  @spec is_fave?(any()) :: boolean()
+  defp is_fave?(status) do
+    uri_elements = status |> Map.get("uri") |> String.split(":")
+    Enum.member?(uri_elements, "fave")
+  end
+  @spec has_nobot?(any()) :: boolean()
+  defp has_nobot?(status) do
+    case get_in(status, ["user", "description"]) do
+      nil ->
+        false
+      description ->
+        description
+        |> String.downcase()
+        |> String.contains?("nobot")
+    end
+  end
+  @spec extract_mentions_from_status(any()) :: ApiCrawler.instance_interactions()
+  defp extract_mentions_from_status(status) do
+    status["attentions"]
+    |> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
+    |> Enum.reduce(%{}, fn domain, acc ->
+      Map.update(acc, domain, 1, &(&1 + 1))
+    end)
+  end
+  # Parses the messed-up time format that GNU social uses
+  # Like seriously, it's 2019, why *wouldn't* you use iso8601?
+  @spec parse_timestamp(String.t()) :: NaiveDateTime.t()
+  defp parse_timestamp(timestamp) do
+    timestamp
+    |> Timex.parse!("{WDshort} {Mshort} {0D} {h24}:{0m}:{0s} {0Z} {YYYY}")
+    |> Timex.to_naive_datetime()
+  end
+end
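
To illustrate the mention counting at the end of this module: given a made-up GNU Social status with two "attentions" pointing at the same remote instance, extract_mentions_from_status/1 tallies one count per mentioned domain (get_domain/1 comes from Backend.Crawler.Util, imported above).

# Illustrative status map; the "attentions"/"profileurl" fields are the ones the crawler reads.
status = %{
  "attentions" => [
    %{"profileurl" => "https://quitter.example/user1"},
    %{"profileurl" => "https://quitter.example/user2"}
  ]
}

status["attentions"]
|> Enum.map(fn mention -> get_domain(mention["profileurl"]) end)
|> Enum.reduce(%{}, fn domain, acc -> Map.update(acc, domain, 1, &(&1 + 1)) end)
# => %{"quitter.example" => 2}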

View file

@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @behaviour ApiCrawler
   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get("https://#{domain}/api/v1/instance") do
-      {:ok, response} -> if is_http_200?(response), do: has_title?(response.body), else: false
-      {:error, _error} -> false
+  def is_instance_type?(domain, result) do
+    # We might already know that this is a Pleroma instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :pleroma do
+      true
+    else
+      case get_and_decode("https://#{domain}/api/v1/instance") do
+        {:ok, %{"title" => _title}} -> true
+        _other -> false
+      end
     end
   end
@@ -26,8 +31,8 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   end
   @impl ApiCrawler
-  def crawl(domain) do
-    instance = Jason.decode!(get!("https://#{domain}/api/v1/instance").body)
+  def crawl(domain, _current_result) do
+    instance = get_and_decode!("https://#{domain}/api/v1/instance")
     user_count = get_in(instance, ["stats", "user_count"])
     if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
@@ -51,12 +56,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
   @spec crawl_large_instance(String.t(), any()) :: ApiCrawler.t()
   defp crawl_large_instance(domain, instance) do
-    # servers may not publish peers
-    peers =
-      case get("https://#{domain}/api/v1/instance/peers") do
-        {:ok, response} -> if is_http_200?(response), do: Jason.decode!(response.body), else: []
-        {:error, _error} -> []
-      end
+    peers = get_peers(domain)
     Logger.debug("Found #{length(peers)} peers.")
@@ -124,15 +124,15 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     Logger.debug("Crawling #{endpoint}")
-    statuses =
-      endpoint
-      |> get!()
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = get_and_decode!(endpoint)
     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["created_at"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["created_at"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -166,12 +166,11 @@ defmodule Backend.Crawler.Crawlers.Mastodon do
     end
   end
-  # To check if the endpoint works as expected
-  @spec has_title?(String.t()) :: boolean
-  defp has_title?(body) do
-    case Jason.decode(body) do
-      {:ok, decoded} -> Map.has_key?(decoded, "title")
-      {:error, _error} -> false
+  defp get_peers(domain) do
+    # servers may not publish peers
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> peers
+      {:error, _err} -> []
     end
   end
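
In short, when the Nodeinfo pre-crawl has already identified a Pleroma server, the Mastodon crawler claims the instance without another HTTP request; only when no nodeinfo result exists does it probe /api/v1/instance. A hypothetical call (domains and aliases are illustrative):

# Illustrative only; assumes `alias Backend.Crawler.Crawlers.{Mastodon, Nodeinfo}`.
Mastodon.is_instance_type?("pleroma.example", %Nodeinfo{instance_type: :pleroma})
# => true, with no request made

Mastodon.is_instance_type?("mastodon.example", nil)
# => decided by whether GET /api/v1/instance returns JSON containing a "title" key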

View file

@@ -7,10 +7,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   require Logger
   @impl ApiCrawler
-  def is_instance_type?(domain) do
-    case get_version_and_description(domain) do
-      {:ok, _} -> true
-      {:error, _} -> false
+  def is_instance_type?(domain, result) do
+    # We may already know that this is a Misskey instance from nodeinfo
+    if result != nil and Map.get(result, :instance_type) == :misskey do
+      true
+    else
+      case get_version_and_description(domain) do
+        {:ok, _} -> true
+        {:error, _} -> false
+      end
     end
   end
@@ -27,11 +32,9 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end
   @impl ApiCrawler
-  def crawl(domain) do
-    with {:ok, %{status_code: 200, body: stats_body}} <- post("https://#{domain}/api/stats") do
-      %{"originalUsersCount" => user_count, "originalNotesCount" => status_count} =
-        Jason.decode!(stats_body)
+  def crawl(domain, _result) do
+    with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
+           post_and_decode("https://#{domain}/api/stats") do
       if is_above_user_threshold?(user_count) or has_opted_in?(domain) do
         crawl_large_instance(domain, user_count, status_count)
       else
@@ -107,15 +110,15 @@ defmodule Backend.Crawler.Crawlers.Misskey do
     Logger.debug("Crawling #{endpoint} with untilId=#{until_id}")
-    statuses =
-      endpoint
-      |> post!(Jason.encode!(params))
-      |> Map.get(:body)
-      |> Jason.decode!()
+    statuses = post_and_decode!(endpoint, Jason.encode!(params))
     filtered_statuses =
       statuses
-      |> Enum.filter(fn s -> is_after?(s["createdAt"], min_timestamp) end)
+      |> Enum.filter(fn s ->
+        s["createdAt"]
+        |> NaiveDateTime.from_iso8601!()
+        |> is_after?(min_timestamp)
+      end)
     if length(filtered_statuses) > 0 do
       # get statuses that are eligible (i.e. users don't have #nobot in their profile) and have mentions
@@ -151,35 +154,22 @@ defmodule Backend.Crawler.Crawlers.Misskey do
   end
   @spec get_version_and_description(String.t()) ::
-          {:ok, {String.t(), String.t()}} | {:error, String.t()}
+          {:ok, {String.t(), String.t()}} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
   defp get_version_and_description(domain) do
-    case post("https://#{domain}/api/meta") do
-      {:ok, %{status_code: 200, body: body}} ->
-        case Jason.decode(body) do
-          {:ok, decoded} ->
-            {:ok, {Map.get(decoded, "version"), Map.get(decoded, "description")}}
-          {:error, _error} ->
-            {:error, "invalid response"}
-        end
-      _ ->
-        {:error, "unsuccesful request"}
+    case post_and_decode("https://#{domain}/api/meta") do
+      {:ok, %{"version" => version, "description" => description}} ->
+        {:ok, {version, description}}
+      {:error, err} ->
+        {:error, err}
     end
   end
   @spec get_peers(String.t()) :: {:ok, [String.t()]} | {:error, Jason.DecodeError.t()}
   defp get_peers(domain) do
-    case get("https://#{domain}/api/v1/instance/peers") do
-      {:ok, response} ->
-        with %{status_code: 200, body: body} <- response do
-          Jason.decode(body)
-        else
-          _ -> {:ok, []}
-        end
-      {:error, _} ->
-        {:ok, []}
+    case get_and_decode("https://#{domain}/api/v1/instance/peers") do
+      {:ok, peers} -> {:ok, peers}
+      {:error, _} -> {:ok, []}
     end
   end
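
A side effect of the crawl/2 change above: the response keys are now matched in the `with` head, so an /api/stats reply that fails to decode or lacks originalUsersCount/originalNotesCount falls through to the `else` branch instead of raising from the old `Jason.decode!` plus bare `=` match. A rough sketch, with a made-up domain:

# Illustrative only.
with {:ok, %{"originalUsersCount" => user_count, "originalNotesCount" => status_count}} <-
       post_and_decode("https://misskey.example/api/stats") do
  {user_count, status_count}
else
  _not_misskey_or_unreachable -> :skipped
end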

View file

@@ -0,0 +1,117 @@
+defmodule Backend.Crawler.Crawlers.Nodeinfo do
+  alias Backend.Crawler.ApiCrawler
+  require Logger
+  import Backend.Util
+  import Backend.Crawler.Util
+  @moduledoc """
+  This module is slightly different from the other crawlers.
+  It doesn't implement the ApiCrawler spec because it isn't run as a self-contained crawler.
+  Instead, it's run before all the other crawlers.
+  This is to get the user count. Some servers don't publish this in other places (e.g. GNU Social, PeerTube) so we need
+  nodeinfo to know whether it's a personal instance or not.
+  """
+  defstruct [
+    :description,
+    :user_count,
+    :status_count,
+    :instance_type,
+    :version
+  ]
+  @type t() :: %__MODULE__{
+          description: String.t(),
+          user_count: integer,
+          status_count: integer,
+          instance_type: ApiCrawler.instance_type(),
+          version: String.t()
+        }
+  @spec allows_crawling?(String.t()) :: boolean()
+  def allows_crawling?(domain) do
+    [
+      ".well-known/nodeinfo"
+    ]
+    |> Enum.map(fn endpoint -> "https://#{domain}#{endpoint}" end)
+    |> urls_are_crawlable?()
+  end
+  @spec crawl(String.t()) :: {:ok, t()} | {:error, nil}
+  def crawl(domain) do
+    with {:ok, nodeinfo_url} <- get_nodeinfo_url(domain),
+         {:ok, nodeinfo} <- get_nodeinfo(nodeinfo_url) do
+      {:ok, nodeinfo}
+    else
+      _other -> {:error, nil}
+    end
+  end
+  @spec get_nodeinfo_url(String.t()) ::
+          {:ok, String.t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo_url(domain) do
+    case get_and_decode("https://#{domain}/.well-known/nodeinfo") do
+      {:ok, response} -> {:ok, process_nodeinfo_url(response)}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec process_nodeinfo_url(any()) :: String.t()
+  defp process_nodeinfo_url(response) do
+    response
+    |> Map.get("links")
+    |> Enum.filter(fn %{"rel" => rel} -> is_compatible_nodeinfo_version?(rel) end)
+    |> Kernel.hd()
+    |> Map.get("href")
+  end
+  @spec get_nodeinfo(String.t()) ::
+          {:ok, t()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  defp get_nodeinfo(nodeinfo_url) do
+    case get_and_decode(nodeinfo_url) do
+      {:ok, nodeinfo} -> {:ok, process_nodeinfo(nodeinfo)}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec process_nodeinfo(any()) :: t()
+  defp process_nodeinfo(nodeinfo) do
+    user_count = get_in(nodeinfo, ["usage", "users", "total"])
+    if is_above_user_threshold?(user_count) do
+      # Both of these are used, depending on the server implementation
+      description =
+        [
+          get_in(nodeinfo, ["metadata", "description"]),
+          get_in(nodeinfo, ["metadata", "nodeDescription"])
+        ]
+        |> Enum.filter(fn d -> d != nil end)
+        |> Enum.at(0)
+      type = nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
+      %__MODULE__{
+        description: description,
+        user_count: user_count,
+        status_count: get_in(nodeinfo, ["usage", "localPosts"]),
+        instance_type: type,
+        version: get_in(nodeinfo, ["software", "version"])
+      }
+    else
+      %{
+        description: nil,
+        user_count: user_count,
+        status_count: nil,
+        instance_type: nil,
+        version: nil
+      }
+    end
+  end
+  @spec is_compatible_nodeinfo_version?(String.t()) :: boolean()
+  defp is_compatible_nodeinfo_version?(schema_url) do
+    version = String.slice(schema_url, (String.length(schema_url) - 3)..-1)
+    Enum.member?(["1.0", "1.1", "2.0"], version)
+  end
+end
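
For reference, a trimmed-down sketch of the kind of nodeinfo document process_nodeinfo/1 consumes (all values invented): the software name is downcased into the instance_type atom, and the usage block supplies the user and status counts that the later crawlers reuse.

# Illustrative nodeinfo payload, already decoded from JSON.
nodeinfo = %{
  "software" => %{"name" => "Pleroma", "version" => "1.0.0"},
  "usage" => %{"users" => %{"total" => 1200}, "localPosts" => 540_000},
  "metadata" => %{"nodeDescription" => "An example instance"}
}

get_in(nodeinfo, ["usage", "users", "total"])
# => 1200

nodeinfo |> get_in(["software", "name"]) |> String.downcase() |> String.to_atom()
# => :pleroma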

View file

@@ -54,7 +54,7 @@ defmodule Backend.Crawler.StaleInstanceManager do
     stale_domains =
       Instance
       |> select([i], i.domain)
-      |> where([i], i.next_crawl < ^now)
+      |> where([i], i.next_crawl < ^now and not i.opt_out)
       |> Repo.all()
       |> MapSet.new()

View file

@@ -8,27 +8,19 @@ defmodule Backend.Crawler.Util do
   # (e.g. https://mastodon.social/@demouser or https://pleroma.site/users/demouser)
   @spec get_domain(String.t()) :: String.t()
   def get_domain(url) do
-    String.slice(url, 8..-1)
-    |> String.split("/")
-    |> Enum.at(0)
+    [_match, domain] = Regex.run(~r/https?:\/\/([\w.-]+)\/.*/, url)
+    domain
   end
-  @spec is_http_200?(HTTPoison.Response.t()) :: boolean
-  def is_http_200?(%{status_code: 200}) do
-    true
-  end
-  def is_http_200?(_) do
-    false
-  end
-  @spec is_after?(String.t(), NaiveDateTime.t() | nil) :: boolean()
+  @doc """
+  Returns true if the first argument is after the second.
+  """
+  @spec is_after?(NaiveDateTime.t(), NaiveDateTime.t() | nil) :: boolean()
   def is_after?(timestamp, threshold) do
     if threshold == nil do
       true
     else
       timestamp
-      |> NaiveDateTime.from_iso8601!()
       # :second is the granularity used in the database
       |> NaiveDateTime.truncate(:second)
       |> NaiveDateTime.compare(threshold)
@@ -36,49 +28,6 @@ defmodule Backend.Crawler.Util do
     end
   end
-  def get(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  @spec get!(binary) :: %{
-          :__struct__ => HTTPoison.AsyncResponse | HTTPoison.Response,
-          optional(:body) => any,
-          optional(:headers) => [any],
-          optional(:id) => reference,
-          optional(:request) => HTTPoison.Request.t(),
-          optional(:request_url) => any,
-          optional(:status_code) => integer
-        }
-  def get!(url) do
-    # TODO: add version number to user agent?
-    HTTPoison.get!(url, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  def post(url, body \\ "") do
-    HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
-  def post!(url, body \\ "") do
-    HTTPoison.post!(url, body, [{"User-Agent", get_config(:user_agent)}],
-      hackney: [pool: :crawler],
-      recv_timeout: 15000,
-      timeout: 15000
-    )
-  end
   @spec urls_are_crawlable?([String.t()]) :: boolean()
   def urls_are_crawlable?(urls) do
     user_agent = get_config(:user_agent)
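
The regex-based get_domain/1 above also copes with plain-http profile URLs, which the old String.slice(url, 8..-1) approach (hard-coded to the length of "https://") would truncate. For example, with illustrative URLs:

# Illustrative only.
get_domain("https://mastodon.social/@demouser")
# => "mastodon.social"

get_domain("http://gnusocial.example/notice/123")
# => "gnusocial.example" (the old slice-based version would have returned "nusocial.example")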

View file

@@ -145,4 +145,54 @@ defmodule Backend.Util do
   def convert_keys_to_atoms(map) do
     map |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
   end
+  @doc """
+  Gets and decodes a HTTP response.
+  """
+  @spec get_and_decode(String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def get_and_decode(url) do
+    case HTTPoison.get(url, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: body}} -> Jason.decode(body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec get_and_decode!(String.t()) :: any()
+  def get_and_decode!(url) do
+    case get_and_decode(url) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
+  @doc """
+  POSTS to a HTTP endpoint and decodes the JSON response.
+  """
+  @spec post_and_decode(String.t(), String.t()) ::
+          {:ok, any()} | {:error, Jason.DecodeError.t() | HTTPoison.Error.t()}
+  def post_and_decode(url, body \\ "") do
+    case HTTPoison.post(url, body, [{"User-Agent", get_config(:user_agent)}],
+           hackney: [pool: :crawler],
+           recv_timeout: 15000,
+           timeout: 15000
+         ) do
+      {:ok, %{status_code: 200, body: response_body}} -> Jason.decode(response_body)
+      {:ok, _} -> {:error, %HTTPoison.Error{reason: "Non-200 response"}}
+      {:error, err} -> {:error, err}
+    end
+  end
+  @spec post_and_decode!(String.t(), String.t()) :: any()
+  def post_and_decode!(url, body \\ "") do
+    case post_and_decode(url, body) do
+      {:ok, decoded} -> decoded
+      {:error, error} -> raise error
+    end
+  end
 end
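
These helpers centralize the HTTPoison-plus-Jason plumbing the crawlers previously repeated inline. A rough usage sketch (the URL is a placeholder): the non-bang variants return tagged tuples, while the bang variants raise the underlying HTTPoison.Error or Jason.DecodeError, which is what Backend.Crawler's rescue clauses catch.

# Illustrative only.
case get_and_decode("https://example.com/api/v1/instance") do
  {:ok, instance} -> Map.get(instance, "title")
  {:error, %HTTPoison.Error{reason: reason}} -> {:error, reason}
  {:error, %Jason.DecodeError{} = err} -> {:error, err}
end

# Or let failures raise and be handled further up:
instance = get_and_decode!("https://example.com/api/v1/instance")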

View file

@@ -14,10 +14,7 @@ defmodule BackendWeb.AdminLoginController do
     # TODO: this assumes mastodon/pleroma API
     cleaned_domain = clean_domain(domain)
-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
     render(conn, "show.json", instance_data: instance_data, cleaned_domain: cleaned_domain)
   end
@@ -25,10 +22,7 @@ defmodule BackendWeb.AdminLoginController do
   def create(conn, %{"domain" => domain, "type" => type}) do
     cleaned_domain = clean_domain(domain)
-    instance_data =
-      HTTPoison.get!("https://#{cleaned_domain}/api/v1/instance")
-      |> Map.get(:body)
-      |> Jason.decode!()
+    instance_data = get_and_decode!("https://#{cleaned_domain}/api/v1/instance")
     error =
       cond do

View file

@@ -3,7 +3,7 @@ import { IconNames } from "@blueprintjs/icons";
 import React from "react";
 import { QUALITATIVE_COLOR_SCHEME } from "../../constants";
 import { typeColorScheme } from "../../types";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";
 interface IInstanceTypeProps {
   type: string;
@@ -15,7 +15,7 @@ interface IInstanceTypeProps {
  */
 const InstanceType: React.FC<IInstanceTypeProps> = ({ type, colorAfterName }) => {
   const idx = typeColorScheme.values.indexOf(type);
-  const name = " " + capitalize(type);
+  const name = " " + getTypeDisplayString(type);
   return (
     <>
       {!!colorAfterName && name}

View file

@@ -4,7 +4,7 @@ import React, { MouseEvent } from "react";
 import styled from "styled-components";
 import { INSTANCE_TYPES } from "../../constants";
 import { getSearchFilterDisplayValue, ISearchFilter } from "../../searchFilters";
-import { capitalize } from "../../util";
+import { getTypeDisplayString } from "../../util";
 const SearchFilterContainer = styled.div`
   margin: 10px 0 0 0;
@@ -30,7 +30,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
   const handleSelectInstanceType = (e: MouseEvent<HTMLElement>) => {
     const field = "type";
     const relation = "eq";
-    const value = e.currentTarget.innerText.toLowerCase();
+    const value = e.currentTarget.innerText.toLowerCase().replace(" ", "");
     const filter: ISearchFilter = {
       displayValue: getSearchFilterDisplayValue(field, relation, value),
       field,
@@ -43,7 +43,7 @@ const SearchFilters: React.FC<ISearchFiltersProps> = ({ selectedFilters, selectF
     <Menu>
       <MenuItem icon={IconNames.SYMBOL_CIRCLE} text="Instance type" disabled={hasInstanceTypeFilter}>
         {INSTANCE_TYPES.map(t => (
-          <MenuItem key={t} text={capitalize(t)} onClick={handleSelectInstanceType} />
+          <MenuItem key={t} text={getTypeDisplayString(t)} onClick={handleSelectInstanceType} />
         ))}
       </MenuItem>
     </Menu>

View file

@@ -40,4 +40,4 @@ export interface IInstanceDomainPath {
 }
 // We could also extract the values from the server response, but this would slow things down...
-export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey"];
+export const INSTANCE_TYPES = ["mastodon", "gab", "pleroma", "misskey", "gnusocial"];

View file

@@ -68,3 +68,13 @@ export const getBuckets = (min: number, max: number, steps: number, exponential:
     return range(min, max, bucketSize);
   }
 };
+const typeToDisplay = {
+  gnusocial: "GNU Social"
+};
+export const getTypeDisplayString = (key: string) => {
+  if (key in typeToDisplay) {
+    return typeToDisplay[key];
+  }
+  return capitalize(key);
+};
};