WIP moonworm crawler

pull/481/head
yhtiyar 2021-12-07 17:39:33 +03:00
parent 6c895bc114
commit 20291871a5
4 changed files with 323 additions and 0 deletions

View file

@@ -0,0 +1,101 @@
import json
import logging
import time
from typing import Any, Dict, List

from moonstreamdb.models import Base
from sqlalchemy.orm.session import Session
from web3.main import Web3

from mooncrawl.data import AvailableBlockchainType

from ..blockchain import connect
from ..settings import (
    MOONSTREAM_ADMIN_ACCESS_TOKEN,
    MOONSTREAM_DATA_JOURNAL_ID,
    bugout_client,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _retry_connect_web3(
    blockchain_type: AvailableBlockchainType,
    retry_count: int = 10,
    sleep_time: float = 5,
) -> Web3:
    """
    Retry connecting to the blockchain, sleeping between attempts.
    """
    while retry_count > 0:
        retry_count -= 1
        try:
            web3 = connect(blockchain_type)
            web3.eth.block_number  # probe the connection
            logger.info(f"Connected to {blockchain_type}")
            return web3
        except Exception as e:
            if retry_count == 0:
                error = e
                break
            logger.error(f"Failed to connect to {blockchain_type} blockchain: {e}")
            logger.info(f"Retrying in {sleep_time} seconds")
            time.sleep(sleep_time)
    raise Exception(
        f"Failed to connect to {blockchain_type} blockchain after {retry_count} retries: {error}"
    )
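

# Illustrative usage sketch (hypothetical, not part of this commit): assuming
# AvailableBlockchainType exposes a POLYGON member, a caller can obtain a
# connected client with a bounded number of attempts:
#
#   web3 = _retry_connect_web3(AvailableBlockchainType.POLYGON, retry_count=3, sleep_time=1)
#   print(web3.eth.block_number)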


def _get_heartbeat_entry_id(crawler_type: str) -> str:
    entries = bugout_client.search(
        token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
        journal_id=MOONSTREAM_DATA_JOURNAL_ID,
        query=f"#{crawler_type} #heartbeat",
        limit=1,
    )
    if entries.results:
        return entries.results[0].entry_url.split("/")[-1]
    else:
        logger.info(f"No {crawler_type} heartbeat entry found, creating one")
        entry = bugout_client.create_entry(
            token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
            journal_id=MOONSTREAM_DATA_JOURNAL_ID,
            title=f"{crawler_type} Heartbeat",
            tags=[crawler_type, "heartbeat"],
            content="",
        )
        return str(entry.id)


def heartbeat(crawler_type: str, crawler_status: Dict[str, Any]) -> None:
    """
    The crawler periodically updates its status in a Bugout journal entry:
    - Started at timestamp
    - Started at block number
    - Status: Running/Dead
    - Last crawled block number
    - Number of current jobs
    - Time taken to crawl the last crawl_step and speed per block
    More information will be added later.
    """
    heartbeat_entry_id = _get_heartbeat_entry_id(crawler_type)
    bugout_client.update_entry_content(
        token=MOONSTREAM_ADMIN_ACCESS_TOKEN,
        journal_id=MOONSTREAM_DATA_JOURNAL_ID,
        entry_id=heartbeat_entry_id,
        title=f"{crawler_type} Heartbeat",
        content=json.dumps(crawler_status, indent=2),
    )
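

# Illustrative call (hypothetical field names, matching the docstring above):
#
#   heartbeat(
#       crawler_type="event",
#       crawler_status={
#           "started_at": 1638885573,
#           "start_block": 21418707,
#           "last_crawled_block": 21419000,
#           "status": "Running",
#           "jobs_count": 12,
#           "crawl_step_seconds": 4.2,
#       },
#   )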


def save_labels(db_session: Session, labels: List[Base]) -> None:
    """
    Save labels in the database.
    """
    try:
        db_session.add_all(labels)
        db_session.commit()
    except Exception as e:
        logger.error(f"Failed to save labels: {e}")
        db_session.rollback()
        raise e

View file

@@ -0,0 +1,221 @@
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

from eth_typing.evm import ChecksumAddress
from moonstreamdb.models import Base
from moonworm.crawler.log_scanner import _fetch_events_chunk
from sqlalchemy.orm.session import Session
from web3 import Web3

from ..blockchain import (
    connect,
    get_block_model,
    get_label_model,
)
from ..data import AvailableBlockchainType
from ..settings import CRAWLER_LABEL
from .crawler import save_labels

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class EventCrawlJob:
    event_abi_hash: str
    event_abi: Dict[str, Any]
    contracts: List[ChecksumAddress]
    created_at: int


@dataclass
class Event:
    event_name: str
    args: Dict[str, Any]
    address: str
    block_number: int
    block_timestamp: int
    transaction_hash: str
    log_index: int


def _event_to_label(blockchain_type: AvailableBlockchainType, event: Event) -> Base:
    """
    Creates a label model.
    """
    label_model = get_label_model(blockchain_type)
    label = label_model(
        label=CRAWLER_LABEL,
        label_data={
            "type": "event",
            "name": event.event_name,
            "args": event.args,
        },
        address=event.address,
        block_number=event.block_number,
        block_timestamp=event.block_timestamp,
        transaction_hash=event.transaction_hash,
        log_index=event.log_index,
    )
    return label


def _get_block_timestamp_from_web3(web3: Web3, block_number: int) -> int:
    """
    Gets the timestamp of a block from the blockchain.
    Raises an exception if the block is not found.
    """
    return web3.eth.getBlock(block_number).timestamp


# blocks_cache is passed in as an argument so this function can be reused by the
# tx_call crawler, sharing one cache between the tx_call and event crawlers.
def get_block_timestamp(
    db_session: Session,
    blockchain_type: AvailableBlockchainType,
    block_number: int,
    blocks_cache: Dict[int, int],
    max_blocks_batch: int = 30,
) -> int:
    """
    Get the timestamp of a block.
    First tries to get the block from the cache,
    then tries to get the block from the db,
    then tries to get it from the blockchain.
    The cache is updated after the call.
    If the cache grows too large, it is cleared.
    :param block_number: The block number.
    :param max_blocks_batch: The maximum number of blocks to fetch in a single db query.
    :param blocks_cache: The cache of blocks.
    :return: The timestamp of the block.
    """
    assert max_blocks_batch > 0

    if block_number in blocks_cache:
        return blocks_cache[block_number]

    block_model = get_block_model(blockchain_type)
    blocks = (
        db_session.query(block_model)
        .filter(block_model.block_number >= block_number)
        .filter(block_model.block_number <= block_number + max_blocks_batch - 1)
        .order_by(block_model.block_number.asc())
        .all()
    )

    target_block_timestamp: Optional[int] = None
    if blocks and blocks[0].block_number == block_number:
        target_block_timestamp = blocks[0].timestamp

    if target_block_timestamp is None:
        target_block_timestamp = _get_block_timestamp_from_web3(
            connect(blockchain_type), block_number
        )

    if len(blocks_cache) > max_blocks_batch * 2:
        blocks_cache.clear()

    blocks_cache[block_number] = target_block_timestamp
    for block in blocks:
        blocks_cache[block.block_number] = block.timestamp

    return target_block_timestamp
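

# Illustrative sketch of the cache behaviour described in the docstring above
# (hypothetical values; assumes consecutive rows exist in the blocks table):
#
#   blocks_cache: Dict[int, int] = {}
#   ts_a = get_block_timestamp(db_session, blockchain_type, 21418707, blocks_cache)
#   # The first call prefetches up to max_blocks_batch consecutive rows from the db,
#   # so this lookup is served from the shared cache without another query:
#   ts_b = get_block_timestamp(db_session, blockchain_type, 21418708, blocks_cache)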


def _crawl_events(
    db_session: Session,
    blockchain_type: AvailableBlockchainType,
    web3: Web3,
    jobs: List[EventCrawlJob],
    from_block: int,
    to_block: int,
    blocks_cache: Dict[int, int] = {},
    db_block_query_batch=10,
) -> List[Event]:
    all_events = []
    for job in jobs:
        raw_events = _fetch_events_chunk(
            web3,
            job.event_abi,
            from_block,
            to_block,
            job.contracts,
            on_decode_error=lambda e: print(
                f"Error decoding event: {e}"
            ),  # TODO report via humbug
        )
        for raw_event in raw_events:
            raw_event["blockTimestamp"] = get_block_timestamp(
                db_session,
                blockchain_type,
                raw_event["blockNumber"],
                blocks_cache,
                db_block_query_batch,
            )
            event = Event(
                event_name=raw_event["event"],
                args=raw_event["args"],
                address=raw_event["address"],
                block_number=raw_event["blockNumber"],
                block_timestamp=raw_event["blockTimestamp"],
                transaction_hash=raw_event["transactionHash"],
                log_index=raw_event["logIndex"],
            )
            all_events.append(event)
    return all_events


def continious_event_crawler(
    db_session: Session,
    blockchain_type: AvailableBlockchainType,
    web3: Web3,
    initial_jobs: List[EventCrawlJob],
    start_block: int,
    max_blocks_batch: int = 100,
    min_blocks_batch: int = 10,
    confirmations: int = 60,
    min_sleep_time: float = 0.1,
):
    assert min_blocks_batch < max_blocks_batch

    crawl_start_time = int(time.time())

    blocks_cache: Dict[int, int] = {}

    while True:
        # Run a trivial query (limit 1) so the db session is not closed while idle.
        db_session.execute("SELECT 1")

        time.sleep(min_sleep_time)

        end_block = min(
            web3.eth.blockNumber - confirmations,
            start_block + max_blocks_batch,
        )

        if start_block + min_blocks_batch > end_block:
            min_sleep_time *= 2
            continue

        min_sleep_time /= 2

        all_events = _crawl_events(
            db_session=db_session,
            blockchain_type=blockchain_type,
            web3=web3,
            jobs=initial_jobs,
            from_block=start_block,
            to_block=end_block,
            blocks_cache=blocks_cache,
            db_block_query_batch=min_blocks_batch * 2,
        )

        # TODO ask for new jobs
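
The diff above ends mid-loop, before labels are saved or start_block is advanced. Below is a minimal, hypothetical sketch of how the crawler could be invoked once wired up; the module paths, the POLYGON enum member, the placeholder ERC-20 Transfer ABI, and run_event_crawler itself are assumptions for illustration, not part of this commit.

import time

from sqlalchemy.orm.session import Session
from web3 import Web3

from mooncrawl.data import AvailableBlockchainType
# Hypothetical module paths: the diff does not show the file names of the
# crawler modules added in this commit.
from mooncrawl.moonworm_crawler.crawler import _retry_connect_web3
from mooncrawl.moonworm_crawler.event_crawler import EventCrawlJob, continious_event_crawler

# Standard ERC-20 Transfer event ABI, used here only as a placeholder job.
TRANSFER_ABI = {
    "anonymous": False,
    "inputs": [
        {"indexed": True, "name": "from", "type": "address"},
        {"indexed": True, "name": "to", "type": "address"},
        {"indexed": False, "name": "value", "type": "uint256"},
    ],
    "name": "Transfer",
    "type": "event",
}


def run_event_crawler(db_session: Session, contract_address: str, start_block: int) -> None:
    blockchain_type = AvailableBlockchainType.POLYGON  # assumed enum member
    job = EventCrawlJob(
        event_abi_hash="<placeholder>",
        event_abi=TRANSFER_ABI,
        contracts=[Web3.toChecksumAddress(contract_address)],
        created_at=int(time.time()),
    )
    web3 = _retry_connect_web3(blockchain_type)
    # Enters the WIP crawl loop above, which does not yet advance start_block
    # or persist labels.
    continious_event_crawler(
        db_session=db_session,
        blockchain_type=blockchain_type,
        web3=web3,
        initial_jobs=[job],
        start_block=start_block,
    )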

View file

@@ -38,6 +38,7 @@ setup(
        "chardet",
        "fastapi",
        "moonstreamdb",
        "moonworm==0.0.5",
        "humbug",
        "pydantic",
        "python-dateutil",