# moonstream/crawlers/mooncrawl/mooncrawl/moonworm_crawler/crawler.py

import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from uuid import UUID
from bugout.data import BugoutJournalEntries, BugoutSearchResult
from eth_typing.evm import ChecksumAddress
from moonstreamdb.blockchain import AvailableBlockchainType
from moonworm.deployment import find_deployment_block # type: ignore
from web3.main import Web3
from ..blockchain import connect
from ..reporter import reporter
from ..settings import (
BUGOUT_REQUEST_TIMEOUT_SECONDS,
HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES,
HISTORICAL_CRAWLER_STATUSES,
MOONSTREAM_ADMIN_ACCESS_TOKEN,
MOONSTREAM_MOONWORM_TASKS_JOURNAL,
bugout_client,
)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SubscriptionTypes(Enum):
POLYGON_BLOCKCHAIN = "polygon_smartcontract"
ETHEREUM_BLOCKCHAIN = "ethereum_smartcontract"
MUMBAI_BLOCKCHAIN = "mumbai_smartcontract"
XDAI_BLOCKCHAIN = "xdai_smartcontract"
WYRM_BLOCKCHAIN = "wyrm_smartcontract"
ZKSYNC_ERA_TESTNET_BLOCKCHAIN = "zksync_era_testnet_smartcontract"
ZKSYNC_ERA_BLOCKCHAIN = "zksync_era_smartcontract"
ARBITRUM_NOVA_BLOCKCHAIN = "arbitrum_nova_smartcontract"
ARBITRUM_SEPOLIA_BLOCKCHAIN = "arbitrum_sepolia_smartcontract"
XAI_BLOCKCHAIN = "xai_smartcontract"
XAI_SEPOLIA_BLOCKCHAIN = "xai_sepolia_smartcontract"
def abi_input_signature(input_abi: Dict[str, Any]) -> str:
"""
Stringifies a function ABI input object according to the ABI specification:
https://docs.soliditylang.org/en/v0.5.3/abi-spec.html
"""
input_type = input_abi["type"]
if input_type.startswith("tuple"):
component_types = [
abi_input_signature(component) for component in input_abi["components"]
]
input_type = f"({','.join(component_types)}){input_type[len('tuple'):]}"
return input_type
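# Example (illustrative): a tuple-array input such as
#   {"type": "tuple[]", "components": [{"type": "address"}, {"type": "uint256"}]}
# stringifies to "(address,uint256)[]": the component types are joined with
# commas and the suffix after "tuple" (here "[]") is preserved.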
def abi_function_signature(function_abi: Dict[str, Any]) -> str:
"""
Stringifies a function ABI according to the ABI specification:
https://docs.soliditylang.org/en/v0.5.3/abi-spec.html
"""
function_name = function_abi["name"]
function_arg_types = [
abi_input_signature(input_item) for input_item in function_abi["inputs"]
]
function_signature = f"{function_name}({','.join(function_arg_types)})"
return function_signature
def encode_function_signature(function_abi: Dict[str, Any]) -> Optional[str]:
"""
Encodes the given function (from ABI) with arguments arg_1, ..., arg_n into its 4 byte signature
by calculating:
keccak256("<function_name>(<arg_1_type>,...,<arg_n_type>")
If function_abi is not actually a function ABI (detected by checking if function_abi["type"] == "function),
returns None.
"""
if function_abi["type"] != "function":
return None
function_signature = abi_function_signature(function_abi)
encoded_signature = Web3.keccak(text=function_signature)[:4]
return encoded_signature.hex()
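# Example (illustrative; the selector can be checked against any ABI tool):
# the canonical ERC-20 transfer function yields the well-known 4-byte selector
# 0xa9059cbb. Web3.keccak returns HexBytes, so .hex() includes the "0x" prefix.
#
#   transfer_abi = {
#       "type": "function",
#       "name": "transfer",
#       "inputs": [{"type": "address"}, {"type": "uint256"}],
#   }
#   assert encode_function_signature(transfer_abi) == "0xa9059cbb"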
def _generate_reporter_callback(
crawler_type: str, blockchain_type: AvailableBlockchainType
) -> Callable[[Exception], None]:
def reporter_callback(error: Exception) -> None:
reporter.error_report(
error,
[
"moonworm",
"crawler",
"decode_error",
crawler_type,
blockchain_type.value,
],
)
return reporter_callback
def _retry_connect_web3(
blockchain_type: AvailableBlockchainType,
retry_count: int = 10,
sleep_time: float = 5,
web3_uri: Optional[str] = None,
) -> Web3:
"""
Retry connecting to the blockchain.
"""
while retry_count > 0:
retry_count -= 1
try:
web3 = connect(blockchain_type, web3_uri=web3_uri)
web3.eth.block_number
logger.info(f"Connected to {blockchain_type}")
return web3
except Exception as e:
if retry_count == 0:
error = e
break
logger.error(f"Failed to connect to {blockchain_type} blockchain: {e}")
logger.info(f"Retrying in {sleep_time} seconds")
time.sleep(sleep_time)
raise Exception(
f"Failed to connect to {blockchain_type} blockchain after {retry_count} retries: {error}"
)
def blockchain_type_to_subscription_type(
blockchain_type: AvailableBlockchainType,
) -> SubscriptionTypes:
if blockchain_type == AvailableBlockchainType.ETHEREUM:
return SubscriptionTypes.ETHEREUM_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.POLYGON:
return SubscriptionTypes.POLYGON_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.MUMBAI:
return SubscriptionTypes.MUMBAI_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.XDAI:
return SubscriptionTypes.XDAI_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.WYRM:
return SubscriptionTypes.WYRM_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.ZKSYNC_ERA_TESTNET:
return SubscriptionTypes.ZKSYNC_ERA_TESTNET_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.ZKSYNC_ERA:
return SubscriptionTypes.ZKSYNC_ERA_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.ARBITRUM_NOVA:
return SubscriptionTypes.ARBITRUM_NOVA_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.ARBITRUM_SEPOLIA:
return SubscriptionTypes.ARBITRUM_SEPOLIA_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.XAI:
return SubscriptionTypes.XAI_BLOCKCHAIN
elif blockchain_type == AvailableBlockchainType.XAI_SEPOLIA:
return SubscriptionTypes.XAI_SEPOLIA_BLOCKCHAIN
else:
raise ValueError(f"Unknown blockchain type: {blockchain_type}")
@dataclass
class EventCrawlJob:
event_abi_hash: str
event_abi: Dict[str, Any]
contracts: List[ChecksumAddress]
address_entries: Dict[ChecksumAddress, Dict[UUID, List[str]]]
created_at: int
@dataclass
class FunctionCallCrawlJob:
contract_abi: List[Dict[str, Any]]
contract_address: ChecksumAddress
entries_tags: Dict[UUID, List[str]]
created_at: int
def get_crawl_job_entries(
subscription_type: SubscriptionTypes,
crawler_type: str,
journal_id: str = MOONSTREAM_MOONWORM_TASKS_JOURNAL,
created_at_filter: Optional[int] = None,
limit: int = 200,
extend_tags: Optional[List[str]] = None,
) -> List[BugoutSearchResult]:
"""
Get all event ABIs from bugout journal
where tags are:
- #crawler_type:crawler_type (either event or function)
- #status:active
- #subscription_type:subscription_type (either polygon_blockchain or ethereum_blockchain)
"""
query = f"#status:active #type:{crawler_type} #subscription_type:{subscription_type.value}"
if extend_tags is not None:
for tag in extend_tags:
query += f" {tag.rstrip()}"
    if created_at_filter is not None:
        # Filter by created_at using >= rather than a strict >, because the
        # previous query may not have returned every job with the latest
        # created_at value. The trade-off is that the same job may be fetched
        # again; such duplicates are filtered out downstream.
        query += f" created_at:>={created_at_filter}"
current_offset = 0
entries: List[BugoutSearchResult] = []
while True:
search_result = bugout_client.search(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=journal_id,
query=query,
offset=current_offset,
limit=limit,
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
search_results = cast(List[BugoutSearchResult], search_result.results)
entries.extend(search_results)
if len(search_results) == 0:
break
current_offset += limit
return entries
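# Example (illustrative): get_crawl_job_entries(SubscriptionTypes.POLYGON_BLOCKCHAIN, "event")
# searches the journal with a query of the form:
#   #status:active #type:event #subscription_type:polygon_smartcontract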
def find_all_deployed_blocks(
web3: Web3, addresses_set: List[ChecksumAddress]
) -> Dict[ChecksumAddress, int]:
"""
find all deployed blocks for given addresses
"""
all_deployed_blocks = {}
for address in addresses_set:
try:
code = web3.eth.getCode(address)
if code != "0x":
block = find_deployment_block(
web3_client=web3,
contract_address=address,
web3_interval=0.5,
)
if block is not None:
all_deployed_blocks[address] = block
if block is None:
logger.error(f"Failed to get deployment block for {address}")
except Exception as e:
logger.error(f"Failed to get code for {address}: {e}")
return all_deployed_blocks
def _get_tag(entry: BugoutSearchResult, tag: str) -> str:
for entry_tag in entry.tags:
if entry_tag.startswith(tag):
            return entry_tag.split(":", 1)[1]
raise ValueError(f"Tag {tag} not found in {entry}")
def make_event_crawl_jobs(entries: List[BugoutSearchResult]) -> List[EventCrawlJob]:
"""
Create EventCrawlJob objects from bugout entries.
"""
crawl_job_by_hash: Dict[str, EventCrawlJob] = {}
for entry in entries:
abi_hash = _get_tag(entry, "abi_method_hash")
contract_address = Web3().toChecksumAddress(_get_tag(entry, "address"))
        entry_id = UUID(entry.entry_url.split("/")[-1])  # entry id is only exposed via the entry URL
existing_crawl_job = crawl_job_by_hash.get(abi_hash)
if existing_crawl_job is not None:
if contract_address not in existing_crawl_job.contracts:
existing_crawl_job.contracts.append(contract_address)
existing_crawl_job.address_entries[contract_address] = {
entry_id: entry.tags
}
else:
abi = cast(str, entry.content)
new_crawl_job = EventCrawlJob(
event_abi_hash=abi_hash,
event_abi=json.loads(abi),
contracts=[contract_address],
address_entries={contract_address: {entry_id: entry.tags}},
created_at=int(datetime.fromisoformat(entry.created_at).timestamp()),
)
crawl_job_by_hash[abi_hash] = new_crawl_job
    return list(crawl_job_by_hash.values())
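# Illustrative behavior: two entries sharing an #abi_method_hash tag but carrying
# different #address tags collapse into a single EventCrawlJob whose `contracts`
# list holds both addresses and whose `address_entries` maps each address to its
# journal entry id and tags.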
def make_function_call_crawl_jobs(
entries: List[BugoutSearchResult],
) -> List[FunctionCallCrawlJob]:
"""
Create FunctionCallCrawlJob objects from bugout entries.
"""
crawl_job_by_address: Dict[str, FunctionCallCrawlJob] = {}
method_signature_by_address: Dict[str, List[str]] = {}
for entry in entries:
        entry_id = UUID(entry.entry_url.split("/")[-1])  # entry id is only exposed via the entry URL
contract_address = Web3().toChecksumAddress(_get_tag(entry, "address"))
abi = json.loads(cast(str, entry.content))
method_signature = encode_function_signature(abi)
if method_signature is None:
raise ValueError(f"{abi} is not a function ABI")
if contract_address not in crawl_job_by_address:
crawl_job_by_address[contract_address] = FunctionCallCrawlJob(
contract_abi=[abi],
contract_address=contract_address,
entries_tags={entry_id: entry.tags},
created_at=int(datetime.fromisoformat(entry.created_at).timestamp()),
)
method_signature_by_address[contract_address] = [method_signature]
else:
if method_signature not in method_signature_by_address[contract_address]:
crawl_job_by_address[contract_address].contract_abi.append(abi)
method_signature_by_address[contract_address].append(method_signature)
crawl_job_by_address[contract_address].entries_tags[
entry_id
] = entry.tags
    return list(crawl_job_by_address.values())
def merge_event_crawl_jobs(
old_crawl_jobs: List[EventCrawlJob], new_event_crawl_jobs: List[EventCrawlJob]
) -> List[EventCrawlJob]:
"""
Merge new event crawl jobs with old ones.
If there is a new event crawl job with the same event_abi_hash
then we will merge the contracts to one job.
Othervise new job will be created
Important:
old_crawl_jobs will be modified
Returns:
Merged list of event crawl jobs
"""
for new_crawl_job in new_event_crawl_jobs:
for old_crawl_job in old_crawl_jobs:
if new_crawl_job.event_abi_hash == old_crawl_job.event_abi_hash:
old_crawl_job.contracts.extend(
[
contract
for contract in new_crawl_job.contracts
if contract not in old_crawl_job.contracts
]
)
for contract_address, entries in new_crawl_job.address_entries.items():
if contract_address in old_crawl_job.address_entries:
old_crawl_job.address_entries[contract_address].update(entries)
else:
old_crawl_job.address_entries[contract_address] = entries
break
else:
old_crawl_jobs.append(new_crawl_job)
return old_crawl_jobs
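# Illustrative merge: if an old job for a given event_abi_hash covers contract A
# and a new job with the same hash covers contracts A and B, the merged job
# covers both, and B's journal entries are copied into address_entries.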
def merge_function_call_crawl_jobs(
old_crawl_jobs: List[FunctionCallCrawlJob],
new_function_call_crawl_jobs: List[FunctionCallCrawlJob],
) -> List[FunctionCallCrawlJob]:
"""
Merge new function call crawl jobs with old ones.
If there is a new function call crawl job with the same contract_address
then we will merge the contracts to one job.
Othervise new job will be created
Important:
old_crawl_jobs will be modified
Returns:
Merged list of function call crawl jobs
"""
for new_crawl_job in new_function_call_crawl_jobs:
for old_crawl_job in old_crawl_jobs:
if new_crawl_job.contract_address == old_crawl_job.contract_address:
old_selectors = [
encode_function_signature(function_abi)
for function_abi in old_crawl_job.contract_abi
]
old_crawl_job.contract_abi.extend(
[
function_abi
for function_abi in new_crawl_job.contract_abi
if encode_function_signature(function_abi) not in old_selectors
]
)
break
else:
old_crawl_jobs.append(new_crawl_job)
return old_crawl_jobs
def _get_heartbeat_entry_id(
crawler_type: str, blockchain_type: AvailableBlockchainType
) -> str:
entries = bugout_client.search(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
query=f"#{crawler_type} #heartbeat #{blockchain_type.value} !#dead",
limit=1,
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
search_results = cast(List[BugoutSearchResult], entries.results)
if search_results:
return search_results[0].entry_url.split("/")[-1]
else:
logger.info(f"No {crawler_type} heartbeat entry found, creating one")
entry = bugout_client.create_entry(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
title=f"{crawler_type} Heartbeat - {blockchain_type.value}",
tags=[crawler_type, "heartbeat", blockchain_type.value],
content="",
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
return str(entry.id)
def heartbeat(
crawler_type: str,
blockchain_type: AvailableBlockchainType,
crawler_status: Dict[str, Any],
is_dead: bool = False,
) -> None:
"""
Periodically crawler will update the status in bugout entry:
- Started at timestamp
- Started at block number
- Status: Running/Dead
- Last crawled block number
- Number of current jobs
- Time taken to crawl last crawl_step and speed per block
and other information later will be added.
"""
heartbeat_entry_id = _get_heartbeat_entry_id(crawler_type, blockchain_type)
bugout_client.update_entry_content(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
entry_id=heartbeat_entry_id,
title=f"{crawler_type} Heartbeat - {blockchain_type.value}. Status: {crawler_status['status']} - {crawler_status['current_time']}",
content=f"{json.dumps(crawler_status, indent=2)}",
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
if is_dead:
bugout_client.update_tags(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
entry_id=heartbeat_entry_id,
tags=[crawler_type, "heartbeat", blockchain_type.value, "dead"],
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
def bugout_state_update(
entries_tags_add: List[Dict[str, Any]],
entries_tags_delete: List[Dict[str, Any]],
) -> BugoutJournalEntries:
"""
Run update of entries tags in bugout
First add tags to entries
Second delete tags from entries
With condition that if first step failed, second step will not be executed
"""
new_entreis_state = BugoutJournalEntries(entries=[])
if len(entries_tags_add) > 0:
try:
new_entreis_state = bugout_client.create_entries_tags(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
entries_tags=entries_tags_add,
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
except Exception as e:
logger.error(f"Failed to add tags to entries: {e}")
if len(entries_tags_delete) > 0 and (
len(entries_tags_add) < 0 or len(new_entreis_state.entries) > 0
):
try:
new_entreis_state = bugout_client.delete_entries_tags(
token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
journal_id=MOONSTREAM_MOONWORM_TASKS_JOURNAL,
entries_tags=entries_tags_delete,
timeout=BUGOUT_REQUEST_TIMEOUT_SECONDS,
)
except Exception as e:
logger.error(f"Failed to delete tags from entries: {e}")
return new_entreis_state
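# Minimal usage sketch (hypothetical entry id): swap a tag on a single entry,
# adding the new tag first and deleting the old one only if the add succeeded.
#
#   entry_id = UUID("00000000-0000-0000-0000-000000000000")
#   bugout_state_update(
#       entries_tags_add=[{"entry_id": entry_id, "tags": ["moonworm_task_pickedup:True"]}],
#       entries_tags_delete=[{"entry_id": entry_id, "tags": ["moonworm_task_pickedup:False"]}],
#   )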
def moonworm_crawler_update_job_as_pickedup(
event_crawl_jobs: List[EventCrawlJob],
function_call_crawl_jobs: List[FunctionCallCrawlJob],
) -> Tuple[List[EventCrawlJob], List[FunctionCallCrawlJob]]:
"""
Apply jobs of moonworm as taked to process
"""
if len(event_crawl_jobs) > 0:
event_crawl_jobs = update_job_state_with_filters( # type: ignore
events=event_crawl_jobs,
address_filter=[],
required_tags=[
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['historical_crawl_status']}:{HISTORICAL_CRAWLER_STATUSES['pending']}",
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['moonworm_status']}:False",
],
tags_to_add=["moonworm_task_pickedup:True"],
tags_to_delete=["moonworm_task_pickedup:False"],
)
if len(function_call_crawl_jobs) > 0:
function_call_crawl_jobs = update_job_state_with_filters( # type: ignore
events=function_call_crawl_jobs,
address_filter=[],
required_tags=[
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['historical_crawl_status']}:{HISTORICAL_CRAWLER_STATUSES['pending']}",
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['moonworm_status']}:False",
],
tags_to_add=[
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['moonworm_status']}:True"
],
tags_to_delete=[
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['moonworm_status']}:False"
],
)
return event_crawl_jobs, function_call_crawl_jobs
def update_job_tags(
    events: Union[List[EventCrawlJob], List[FunctionCallCrawlJob]],
    new_entries_state: BugoutJournalEntries,
):
    """
    Update the tags stored on job objects from the new entries state.
    """
    entry_tags_by_id = {entry.id: entry.tags for entry in new_entries_state.entries}
for event in events:
if isinstance(event, EventCrawlJob):
for contract_address, entries_ids in event.address_entries.items():
for entry_id in entries_ids.keys():
if entry_id in entry_tags_by_id:
event.address_entries[contract_address][entry_id] = (
entry_tags_by_id[entry_id]
)
if isinstance(event, FunctionCallCrawlJob):
for entry_id in event.entries_tags.keys():
if entry_id in entry_tags_by_id:
event.entries_tags[entry_id] = entry_tags_by_id[entry_id]
return events
def update_job_state_with_filters(
events: Union[List[EventCrawlJob], List[FunctionCallCrawlJob]],
address_filter: List[ChecksumAddress],
required_tags: List[str],
tags_to_add: List[str] = [],
tags_to_delete: List[str] = [],
) -> Union[List[EventCrawlJob], List[FunctionCallCrawlJob]]:
"""
Function that updates the state of the job in bugout.
"""
entries_ids_to_update: List[UUID] = []
### TODO: refactor this function
if len(tags_to_add) == 0 and len(tags_to_delete) == 0:
return events
for event in events:
# events
if isinstance(event, EventCrawlJob):
for contract_address, entries_ids in event.address_entries.items():
if address_filter and contract_address not in address_filter:
continue
for entry_id, tags in entries_ids.items():
if set(required_tags).issubset(set(tags)):
entries_ids_to_update.append(entry_id)
# functions
if isinstance(event, FunctionCallCrawlJob):
if address_filter and event.contract_address not in address_filter:
continue
for entry_id, tags in event.entries_tags.items():
if set(required_tags).issubset(set(tags)):
entries_ids_to_update.append(entry_id)
if len(entries_ids_to_update) == 0:
return events
new_entries_state = bugout_state_update(
entries_tags_add=[
{"entry_id": entry_id, "tags": tags_to_add}
for entry_id in entries_ids_to_update
],
entries_tags_delete=[
{"entry_id": entry_id, "tags": tags_to_delete}
for entry_id in entries_ids_to_update
],
)
events = update_job_tags(events, new_entries_state)
return events
def update_entries_status_and_progress(
    events: Union[List[EventCrawlJob], List[FunctionCallCrawlJob]],
    progress_map: Dict[ChecksumAddress, float],
) -> Union[List[EventCrawlJob], List[FunctionCallCrawlJob]]:
    """
    Update entry status and progress in the mooncrawl Bugout journal.
    """
entries_tags_delete: List[Dict[str, Any]] = []
entries_tags_add: List[Dict[str, Any]] = []
for event in events:
if isinstance(event, EventCrawlJob):
for contract_address, entries_ids in event.address_entries.items():
                progress = round(progress_map.get(contract_address, 0) * 100, 2)
(
entries_tags_delete,
entries_tags_add,
) = add_progress_to_tags(
entries=entries_ids,
contract_progress=progress,
entries_tags_delete=entries_tags_delete,
entries_tags_add=entries_tags_add,
)
if isinstance(event, FunctionCallCrawlJob):
            progress = round(progress_map.get(event.contract_address, 0) * 100, 2)
(
entries_tags_delete,
entries_tags_add,
) = add_progress_to_tags(
entries=event.entries_tags,
contract_progress=progress,
entries_tags_delete=entries_tags_delete,
entries_tags_add=entries_tags_add,
)
new_entries_state = bugout_state_update(
entries_tags_add=entries_tags_add,
entries_tags_delete=entries_tags_delete,
)
events = update_job_tags(events, new_entries_state)
return events
def add_progress_to_tags(
entries: Dict[UUID, List[str]],
contract_progress: float,
entries_tags_delete: List[Dict[str, Any]],
entries_tags_add: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""
Calculate progress and add finished tag if progress is 100
"""
new_progress = f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['progress_status']}:{contract_progress}"
for entry_id, tags in entries.items():
# progress
if (
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['historical_crawl_status']}:{HISTORICAL_CRAWLER_STATUSES['finished']}"
in tags
):
continue
if new_progress not in tags:
entries_tags_delete.append(
{
"entry_id": entry_id,
"tags": [
tag
for tag in tags
if tag.startswith(
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['progress_status']}"
)
],
}
)
entries_tags_add.append(
{
"entry_id": entry_id,
"tags": [new_progress],
}
)
if contract_progress >= 100:
entries_tags_add.append(
{
"entry_id": entry_id,
"tags": [
f"{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['historical_crawl_status']}:{HISTORICAL_CRAWLER_STATUSES['finished']}"
],
}
)
return entries_tags_delete, entries_tags_add
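# Illustrative tag transition (prefix values depend on
# HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES in settings): for an entry at 42.5%
# progress, existing "progress:..." tags are queued for deletion and
# "progress:42.5" is queued for addition; once progress reaches 100, the
# finished status tag is queued as well.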