fediverse.space/backend/lib/backend/crawler/api_crawler.ex

84 wiersze
2.6 KiB
Elixir
Czysty Zwykły widok Historia

2019-07-14 11:47:06 +00:00
defmodule Backend.Crawler.ApiCrawler do
  @moduledoc """
  This module is a specification. Crawlers for all instance types must implement its behaviour.

  Make sure to respect the following:

  * You must adhere to the following configuration values:
    * `:status_age_limit_days` specifies that you must only crawl statuses from the most recent N days
    * `:status_count_limit` specifies the max number of statuses to crawl in one go
    * `:personal_instance_threshold` specifies that instances with fewer than this number of users
      should not be crawled (unless `:opt_in` is true)
  * Profiles with the string "nobot" (case insensitive) in their profile must not be included in any stats
  * Make sure to check the most recent crawl of the instance so you don't re-crawl old statuses
  """

  alias Backend.Crawler.Crawlers.Nodeinfo

  # Map of mentioned domain => number of mentions, i.e. entries of {domain_mentioned, count}.
  @type instance_interactions :: %{String.t() => integer()}

  # {domain, type} e.g. {"gab.com", "reject"}
  @type federation_restriction :: {String.t(), String.t()}

  @type instance_type ::
          :mastodon | :pleroma | :gab | :misskey | :gnusocial | :smithereen | :friendica

  # All fields default to nil in the struct itself; `get_default/0` returns the
  # "empty" values (empty list/map, zero) as a plain map instead.
  defstruct [
    :version,
    :description,
    :user_count,
    :status_count,
    :peers,
    :interactions,
    :statuses_seen,
    :instance_type,
    :federation_restrictions
  ]

  @type t() :: %__MODULE__{
          version: String.t() | nil,
          description: String.t() | nil,
          user_count: integer() | nil,
          status_count: integer() | nil,
          peers: [String.t()],
          interactions: instance_interactions(),
          statuses_seen: integer(),
          instance_type: instance_type() | nil,
          federation_restrictions: [federation_restriction()]
        }

  # Deliberately a plain map (not a `%__MODULE__{}` struct): this is the value
  # handed out by `get_default/0`, and its shape is kept as-is for existing callers.
  @empty_result %{
    version: nil,
    description: nil,
    user_count: nil,
    status_count: nil,
    peers: [],
    interactions: %{},
    statuses_seen: 0,
    instance_type: nil,
    federation_restrictions: []
  }

  @doc """
  Check whether the instance at the given domain is of the type that this ApiCrawler implements.

  Arguments are the instance domain and the nodeinfo results.
  """
  # The second argument is typed with this module's own `t()`. The previous
  # `ApiCrawler.t()` pointed at an un-aliased (and therefore unknown) module.
  @callback is_instance_type?(String.t(), t()) :: boolean()

  @doc """
  Check whether the instance allows crawling according to its robots.txt or otherwise.
  """
  @callback allows_crawling?(String.t()) :: boolean()

  @doc """
  Crawl the instance at the given domain.

  Takes two arguments: the domain to crawl and the existing results (from nodeinfo).
  """
  @callback crawl(String.t(), Nodeinfo.t()) :: t()

  @doc """
  Returns the default, empty state.

  The result is a plain map with the same keys as the `t:t/0` struct, where
  `:peers` is `[]`, `:interactions` is `%{}`, `:statuses_seen` is `0`,
  `:federation_restrictions` is `[]` and every other field is `nil`.
  """
  @spec get_default() :: map()
  def get_default do
    @empty_result
  end
end