# moonstream/moonstreamapi/moonstreamapi/providers/bugout.py
#
# 364 lines
# 13 KiB
# Python

"""
Event providers powered by Bugout journals.
"""
import json
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from bugout.app import Bugout
from bugout.data import BugoutResource, BugoutSearchResult
from bugout.journal import SearchOrder
from dateutil.parser import isoparse
from dateutil.tz import UTC
from sqlalchemy.orm import Session
from .. import data
from ..settings import HUMBUG_TXPOOL_CLIENT_ID
from ..stream_queries import StreamQuery
# Module-level logger; its threshold is pinned to WARNING.
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

# Subscription "address" values listed here are passed to the journal search verbatim
# by EthereumTXPoolProvider.parse_filters instead of being expanded into
# from_address/to_address tag filters.
allowed_tags = ["tag:erc721"]
class BugoutEventProviderError(Exception):
    """
    Generic error raised by BugoutEventProvider instances when a request cannot be served.
    """
class BugoutEventProvider:
    """
    Provides events (specified by a conjunction of tags) from a Bugout journal.

    Each configured tag becomes a "#<tag>" term of the journal search query. Time window
    constraints and provider-specific filters (see parse_filters) are appended to those terms.
    """

    def __init__(
        self,
        event_type: str,
        description: str,
        default_time_interval_seconds: int,
        estimated_events_per_time_interval: float,
        tags: Optional[List[str]] = None,
        batch_size: int = 100,
        timeout: float = 30.0,
    ):
        """
        Args:
        - event_type: Name of event this instance provides
        - description: Human-readable description of the provider
        - default_time_interval_seconds: Stored on the instance; not used by this class itself
        - estimated_events_per_time_interval: Stored on the instance; not used by this class itself
        - tags: Tags which define events that this provider works with
        - batch_size: Number of events to read from journal at a time
        - timeout: Request timeout for Bugout requests
        """
        self.event_type = event_type
        self.description = description
        self.default_time_interval_seconds = default_time_interval_seconds
        self.estimated_events_per_time_interval = estimated_events_per_time_interval
        self.batch_size = batch_size
        self.timeout = timeout

        self.tags: List[str] = [] if tags is None else tags
        # Base search terms shared by every query this provider issues.
        self.query = [f"#{tag}" for tag in self.tags]

    @staticmethod
    def _format_timestamp(timestamp: float) -> str:
        """
        Render a Unix timestamp as a naive ISO 8601 string expressed in UTC, which is the form
        used in the created_at search constraints.
        """
        # Same output as the deprecated datetime.utcfromtimestamp(timestamp).isoformat().
        return (
            datetime.fromtimestamp(timestamp, tz=UTC).replace(tzinfo=None).isoformat()
        )

    def _search_single_event(
        self,
        bugout_client: Bugout,
        data_journal_id: str,
        data_access_token: str,
        final_query: str,
        order: SearchOrder,
    ) -> Optional[data.Event]:
        """
        Run a journal search limited to one result and return it as an event, or None if the
        search matched nothing.
        """
        search_results = bugout_client.search(
            data_access_token,
            data_journal_id,
            final_query,
            limit=1,
            content=True,
            timeout=self.timeout,
            order=order,
        )
        if not search_results.results:
            return None
        return self.entry_event(search_results.results[0])

    def validate_subscription(
        self, subscription_resource_data: data.SubscriptionResourceData
    ) -> bool:
        """
        This implementation is maximally permissive and returns True for all subscriptions as long as
        their subscription_type_id is the configured event_type.

        Subclasses of this provider can impose stricter criteria on submissions to the relevant event types.
        """
        return subscription_resource_data.subscription_type_id == self.event_type

    def entry_event(self, entry: BugoutSearchResult) -> data.Event:
        """
        Load an event from a Bugout journal entry. Assumes that the entry content is a JSON string
        with no additional markdown formatting.

        The event timestamp is the entry's created_at as integer Unix seconds. A created_at value
        without UTC offset is interpreted as UTC; an explicit offset is honoured.
        """
        event_data = {}
        if entry.content is not None:
            event_data = json.loads(entry.content)

        created_at_dt = isoparse(entry.created_at)
        if created_at_dt.tzinfo is None:
            # Only naive values are tagged as UTC. Overwriting an explicit offset would shift
            # the instant the timestamp refers to.
            created_at_dt = created_at_dt.replace(tzinfo=UTC)
        created_at = int(created_at_dt.timestamp())

        return data.Event(
            event_type=self.event_type,
            event_timestamp=created_at,
            event_data=event_data,
        )

    def parse_filters(
        self, query: StreamQuery, user_subscriptions: Dict[str, List[BugoutResource]]
    ) -> Optional[List[str]]:
        """
        Subclasses can provide additional constraints to apply to the journal search.

        If None is returned, signals that no data should be returned from the provider at all.
        That happens when the query is constrained (by subscription types or subscriptions) and
        does not list this provider's event_type, or when the user has no subscriptions for it.
        Otherwise this base implementation returns an empty list (no extra constraints).
        """
        is_query_constrained = query.subscription_types or query.subscriptions
        if is_query_constrained and self.event_type not in query.subscription_types:
            return None
        if not user_subscriptions.get(self.event_type):
            return None
        return []

    def get_events(
        self,
        db_session: Session,
        bugout_client: Bugout,
        data_journal_id: str,
        data_access_token: str,
        stream_boundary: data.StreamBoundary,
        query: StreamQuery,
        user_subscriptions: Dict[str, List[BugoutResource]],
    ) -> Optional[Tuple[data.StreamBoundary, List[data.Event]]]:
        """
        Uses journal search endpoint to retrieve events for the given stream boundary and query constraints
        from the connected journal.

        Returns None when parse_filters signals that this provider should return no data; otherwise
        the unchanged stream boundary together with all matching events, newest first, fetched in
        pages of batch_size.
        """
        additional_constraints = self.parse_filters(query, user_subscriptions)
        if additional_constraints is None:
            return None

        time_constraints: List[str] = []
        if stream_boundary.start_time > 0:
            operator = ">=" if stream_boundary.include_start else ">"
            start_time = self._format_timestamp(stream_boundary.start_time)
            time_constraints.append(f"created_at:{operator}{start_time}")
        if stream_boundary.end_time is not None:
            operator = "<=" if stream_boundary.include_end else "<"
            end_time = self._format_timestamp(stream_boundary.end_time)
            time_constraints.append(f"created_at:{operator}{end_time}")

        final_query = " ".join(self.query + time_constraints + additional_constraints)

        events: List[data.Event] = []
        offset: Optional[int] = 0
        # The search result reports next_offset as None once the last page has been read.
        while offset is not None:
            search_results = bugout_client.search(
                data_access_token,
                data_journal_id,
                final_query,
                limit=self.batch_size,
                offset=offset,
                content=True,
                timeout=self.timeout,
                order=SearchOrder.DESCENDING,
            )
            events.extend(self.entry_event(entry) for entry in search_results.results)
            offset = search_results.next_offset

        return stream_boundary, events

    def latest_events(
        self,
        db_session: Session,
        bugout_client: Bugout,
        data_journal_id: str,
        data_access_token: str,
        query: StreamQuery,
        num_events: int,
        user_subscriptions: Dict[str, List[BugoutResource]],
    ) -> Optional[List[data.Event]]:
        """
        Gets the latest events corresponding to this provider from the given journal.

        Returns None when parse_filters signals that this provider should return no data.

        Raises:
        - BugoutEventProviderError: if num_events exceeds the configured batch_size
        """
        additional_constraints = self.parse_filters(query, user_subscriptions)
        if additional_constraints is None:
            return None

        if num_events > self.batch_size:
            raise BugoutEventProviderError(
                f"You requested too many events: event_type={self.event_type}, num_events={num_events}, limit={self.batch_size}"
            )

        final_query = " ".join(self.query + additional_constraints)
        search_results = bugout_client.search(
            data_access_token,
            data_journal_id,
            final_query,
            limit=num_events,
            content=True,
            timeout=self.timeout,
            order=SearchOrder.DESCENDING,
        )
        return [self.entry_event(entry) for entry in search_results.results]

    def next_event(
        self,
        db_session: Session,
        bugout_client: Bugout,
        data_journal_id: str,
        data_access_token: str,
        stream_boundary: data.StreamBoundary,
        query: StreamQuery,
        user_subscriptions: Dict[str, List[BugoutResource]],
    ) -> Optional[data.Event]:
        """
        Get the earliest event that occurred after the time window represented by the given stream boundary.

        Returns None when parse_filters signals that this provider should return no data, or when
        no such event exists.

        Raises:
        - BugoutEventProviderError: if the stream boundary has no end_time
        """
        additional_constraints = self.parse_filters(query, user_subscriptions)
        if additional_constraints is None:
            return None

        if stream_boundary.end_time is None:
            raise BugoutEventProviderError(
                "Cannot return next event for a stream boundary which is current."
            )

        end_time = self._format_timestamp(stream_boundary.end_time)
        # If the window already includes its end, the next event must be strictly later.
        operator = ">" if stream_boundary.include_end else ">="
        time_constraint = f"created_at:{operator}{end_time}"

        final_query = " ".join(self.query + additional_constraints + [time_constraint])
        return self._search_single_event(
            bugout_client,
            data_journal_id,
            data_access_token,
            final_query,
            SearchOrder.ASCENDING,
        )

    def previous_event(
        self,
        db_session: Session,
        bugout_client: Bugout,
        data_journal_id: str,
        data_access_token: str,
        stream_boundary: data.StreamBoundary,
        query: StreamQuery,
        user_subscriptions: Dict[str, List[BugoutResource]],
    ) -> Optional[data.Event]:
        """
        Get the latest event that occurred before the time window represented by the given stream boundary.

        Returns None when parse_filters signals that this provider should return no data, or when
        no such event exists.

        Raises:
        - BugoutEventProviderError: if the stream boundary starts at time 0
        """
        additional_constraints = self.parse_filters(query, user_subscriptions)
        if additional_constraints is None:
            return None

        if stream_boundary.start_time == 0:
            raise BugoutEventProviderError(
                "Cannot return previous event for a stream boundary starting at the beginning of time."
            )

        start_time = self._format_timestamp(stream_boundary.start_time)
        # If the window already includes its start, the previous event must be strictly earlier.
        operator = "<" if stream_boundary.include_start else "<="
        time_constraint = f"created_at:{operator}{start_time}"

        final_query = " ".join(self.query + additional_constraints + [time_constraint])
        return self._search_single_event(
            bugout_client,
            data_journal_id,
            data_access_token,
            final_query,
            SearchOrder.DESCENDING,
        )
class EthereumTXPoolProvider(BugoutEventProvider):
    """
    Bugout event provider whose journal search is narrowed to the addresses a user is subscribed to.

    Construction is inherited unchanged from BugoutEventProvider (same parameters and defaults).
    """

    def parse_filters(
        self, query: StreamQuery, user_subscriptions: Dict[str, List[BugoutResource]]
    ) -> Optional[List[str]]:
        """
        Build per-subscription search filters on top of the base provider's gating rules.

        Returns None under the same conditions as BugoutEventProvider.parse_filters (query is
        constrained to other subscription types, or the user has no subscriptions for this
        event_type). Otherwise returns one or two filters per subscription, in subscription order:
        - an "address" listed in allowed_tags is used verbatim as a filter
        - any other address yields "?#from_address:<address>" and "?#to_address:<address>"

        Raises:
        - KeyError: if a subscription's resource_data has no "address" key
        """
        # Reuse the base-class gate so both providers always agree on when to return no data.
        if super().parse_filters(query, user_subscriptions) is None:
            return None

        subscriptions_filters: List[str] = []
        # The gate above guarantees a non-empty subscription list for this event_type.
        for subscription in user_subscriptions[self.event_type]:
            address = subscription.resource_data["address"]
            if address in allowed_tags:
                subscriptions_filters.append(address)
            else:
                # NOTE(review): "?#" is presumably Bugout's optional-tag search prefix, so an
                # entry matches if the address is either sender or receiver; confirm.
                subscriptions_filters.extend(
                    [f"?#from_address:{address}", f"?#to_address:{address}"]
                )
        return subscriptions_filters
# Description reported for the Ethereum whale watch provider.
whalewatch_description = """Event provider for Ethereum whale watch.
Shows the top 10 addresses active on the Ethereum blockchain over the last hour in the following categories:
1. Number of transactions sent
2. Number of transactions received
3. Amount (in WEI) sent
4. Amount (in WEI) received
To restrict your queries to this provider, add a filter of \"type:ethereum_whalewatch\" to your query (query parameter: \"q\") on the /streams endpoint."""

# The Polygon provider gets its own text. Reusing the Ethereum description would tell users to
# filter with "type:ethereum_whalewatch", which does not select the Polygon provider.
# It is derived from the Ethereum text so the two descriptions cannot drift apart.
polygon_whalewatch_description = whalewatch_description.replace(
    "Ethereum", "Polygon"
).replace("ethereum_whalewatch", "polygon_whalewatch")

# Whale watch events are journal entries tagged with the per-chain "trending" crawl type.
ethereum_whalewatch_provider = BugoutEventProvider(
    event_type="ethereum_whalewatch",
    description=whalewatch_description,
    default_time_interval_seconds=310,
    estimated_events_per_time_interval=1,
    tags=["crawl_type:ethereum_trending"],
)

polygon_whalewatch_provider = BugoutEventProvider(
    event_type="polygon_whalewatch",
    description=polygon_whalewatch_description,
    default_time_interval_seconds=310,
    estimated_events_per_time_interval=1,
    tags=["crawl_type:polygon_trending"],
)
# Description reported for the Ethereum transaction pool provider.
ethereum_txpool_description = """Event provider for Ethereum transaction pool.
Shows the latest events (from the previous hour) in the Ethereum transaction pool.
To restrict your queries to this provider, add a filter of \"type:ethereum_txpool\" to your query (query parameter: \"q\") on the /streams endpoint."""
# Transaction pool events are the journal entries tagged with the configured Humbug client ID.
# EthereumTXPoolProvider additionally narrows results to the user's subscribed addresses.
ethereum_txpool_provider = EthereumTXPoolProvider(
event_type="ethereum_txpool",
description=ethereum_txpool_description,
default_time_interval_seconds=5,
estimated_events_per_time_interval=50,
tags=[f"client:{HUMBUG_TXPOOL_CLIENT_ID}"],
)