From 5ed97728526ef8af5a8e8e1db09d78784c2b8d98 Mon Sep 17 00:00:00 2001
From: yhtiyar
Date: Sun, 7 Nov 2021 01:27:26 +0300
Subject: [PATCH] working version of contract deploy crawler

---
 .../mooncrawl/mooncrawl/contract/__init__.py  |   0
 crawlers/mooncrawl/mooncrawl/contract/cli.py  | 180 ++++++
 .../mooncrawl/contract/deployment_crawler.py | 270 ++++++++++++++++++
 crawlers/mooncrawl/requirements.txt           |   2 +-
 crawlers/mooncrawl/setup.py                   |   1 +
 5 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 crawlers/mooncrawl/mooncrawl/contract/__init__.py
 create mode 100644 crawlers/mooncrawl/mooncrawl/contract/cli.py
 create mode 100644 crawlers/mooncrawl/mooncrawl/contract/deployment_crawler.py

diff --git a/crawlers/mooncrawl/mooncrawl/contract/__init__.py b/crawlers/mooncrawl/mooncrawl/contract/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crawlers/mooncrawl/mooncrawl/contract/cli.py b/crawlers/mooncrawl/mooncrawl/contract/cli.py
new file mode 100644
index 00000000..03d9dc33
--- /dev/null
+++ b/crawlers/mooncrawl/mooncrawl/contract/cli.py
@@ -0,0 +1,180 @@
+import argparse
+import json
+import logging
+import time
+from typing import Optional
+
+from moonstreamdb.db import yield_db_session_ctx
+from sqlalchemy.orm.session import Session
+from web3 import Web3
+
+from .deployment_crawler import ContractDeploymentCrawler, MoonstreamDataStore
+from ..ethereum import connect
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SLEEP_TIME = 5 * 60
+
+
+def run_crawler_asc(
+    w3: Web3,
+    session: Session,
+    from_block: Optional[int],
+    to_block: Optional[int],
+    synchronize: bool,
+    batch_size: int,
+    respect_state: bool,
+):
+    """
+    Runs the crawler in ascending block order.
+    """
+    moonstream_data_store = MoonstreamDataStore(session)
+    contract_deployment_crawler = ContractDeploymentCrawler(w3, moonstream_data_store)
+
+    if respect_state:
+        from_block = moonstream_data_store.get_last_labeled_block_number() + 1
+        logger.info(f"Respecting state, starting from block {from_block}")
+
+    if from_block is None:
+        from_block = moonstream_data_store.get_first_block_number()
+        logger.info(f"Starting block set to: {from_block}")
+
+    if to_block is None:
+        to_block = moonstream_data_store.get_last_block_number()
+        logger.info(f"Ending block set to: {to_block}")
+
+    assert from_block <= to_block, "from_block must be less than or equal to to_block"
+
+    logger.info(f"Starting crawling from block {from_block} to block {to_block}")
+    contract_deployment_crawler.crawl(
+        from_block=from_block,
+        to_block=to_block,
+        batch_size=batch_size,
+    )
+    if synchronize:
+        last_crawled_block = to_block
+        while True:
+            latest_block = moonstream_data_store.get_last_block_number()
+            if latest_block > last_crawled_block:
+                contract_deployment_crawler.crawl(
+                    from_block=last_crawled_block + 1,
+                    to_block=latest_block,
+                    batch_size=batch_size,
+                )
+                last_crawled_block = latest_block
+            time.sleep(SLEEP_TIME)
+
+
+def run_crawler_desc(
+    w3: Web3,
+    session: Session,
+    from_block: Optional[int],
+    to_block: Optional[int],
+    synchronize: bool,
+    batch_size: int,
+    respect_state: bool,
+):
+    """
+    Runs the crawler in descending block order.
+    """
+    moonstream_data_store = MoonstreamDataStore(session)
+    contract_deployment_crawler = ContractDeploymentCrawler(w3, moonstream_data_store)
+
+    if respect_state:
+        from_block = moonstream_data_store.get_first_labeled_block_number() - 1
+        logger.info(f"Respecting state, starting from block {from_block}")
+
+    if from_block is None:
+        from_block = moonstream_data_store.get_last_block_number()
+        logger.info(f"Starting block set to: {from_block}")
+
+    if to_block is None:
+        to_block = moonstream_data_store.get_first_block_number()
+        logger.info(f"Ending block set to: {to_block}")
+
+    assert (
+        from_block >= to_block
+    ), "from_block must be greater than or equal to to_block"
+
+    logger.info(f"Starting crawling from block {from_block} to block {to_block}")
+    contract_deployment_crawler.crawl(
+        from_block=from_block,
+        to_block=to_block,
+        batch_size=batch_size,
+    )
+    if synchronize:
+        # Synchronizing in descending order is not supported: an unclean shutdown would
+        # leave holes and break --respect-state, since the crawl step runs in ascending order.
+        raise ValueError("Synchronize not implemented for descending order")
+
+
+def handle_parser(args: argparse.Namespace):
+    with yield_db_session_ctx() as session:
+        w3 = connect()
+        if args.order == "asc":
+            run_crawler_asc(
+                w3=w3,
+                session=session,
+                from_block=args.start,
+                to_block=args.to,
+                synchronize=args.synchronize,
+                batch_size=args.batch,
+                respect_state=args.respect_state,
+            )
+        elif args.order == "desc":
+            run_crawler_desc(
+                w3=w3,
+                session=session,
+                from_block=args.start,
+                to_block=args.to,
+                synchronize=args.synchronize,
+                batch_size=args.batch,
+                respect_state=args.respect_state,
+            )
+        else:
+            raise ValueError(f"Invalid order {args.order}")
+
+
+def generate_parser():
+    """
+    --start, -s: block to start crawling from, default: minimum block from database
+    --to, -t: block to stop crawling at, default: maximum block from database
+    --order: order to crawl (asc, desc), default: asc
+    --synchronize: continuous crawling, default: False
+    --batch, -b: batch size, default: 10
+    --respect-state: if order is asc, start=last_labeled_block+1; if order is desc, start=first_labeled_block-1
+    """
+
+    parser = argparse.ArgumentParser(description="Moonstream Deployment Crawler")
+    parser.add_argument(
+        "--start", "-s", type=int, default=None, help="block to start crawling from"
+    )
+    parser.add_argument(
+        "--to", "-t", type=int, default=None, help="block to stop crawling at"
+    )
+    parser.add_argument(
+        "--order", type=str, default="asc", help="order to crawl: (asc, desc)"
+    )
+    parser.add_argument(
+        "--synchronize", action="store_true", default=False, help="continuous crawling"
+    )
+    parser.add_argument("--batch", "-b", type=int, default=10, help="batch size")
+    parser.add_argument(
+        "--respect-state",
+        action="store_true",
+        default=False,
+        help="If set: if order is asc, start=last_labeled_block+1; if order is desc, start=first_labeled_block-1",
+    )
+    parser.set_defaults(func=handle_parser)
+    return parser
+
+
+def main():
+    parser = generate_parser()
+    args = parser.parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/crawlers/mooncrawl/mooncrawl/contract/deployment_crawler.py b/crawlers/mooncrawl/mooncrawl/contract/deployment_crawler.py
new file mode 100644
index 00000000..de38aab8
--- /dev/null
+++ b/crawlers/mooncrawl/mooncrawl/contract/deployment_crawler.py
@@ -0,0 +1,270 @@
+from dataclasses import dataclass
+import logging
+from typing import List, Optional, Tuple, cast
+from hexbytes import HexBytes
+
+from moonstreamdb.models import EthereumBlock, EthereumTransaction, EthereumLabel
+from sqlalchemy.orm import Session, Query
+from web3 import Web3
+from web3.types import TxReceipt
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ContractDeployment:
+    address: str
+    block_number: int
+    transaction_hash: str
+    deployer_address: str
+    block_timestamp: int
+    gas_used: int
+    gas_price: int
+    transaction_fee: int
+
+
+@dataclass
+class RawDeploymentTx:
+    transaction_hash: str
+    gas_price: int
+    timestamp: int
+
+
+class MoonstreamDataStore:
+    def __init__(self, db_session: Session) -> None:
+        self.db_session = db_session
+        self.label = "contract_deployment"
+
+    def get_last_labeled_block_number(
+        self,
+    ) -> int:
+        """
+        Returns the last block number that has been labeled.
+        """
+
+        last_block = (
+            self.db_session.query(EthereumLabel)
+            .filter(EthereumLabel.label == self.label)
+            .order_by(EthereumLabel.block_number.desc())
+            .first()
+        )
+        if last_block is None:
+            return 0
+        else:
+            return last_block.block_number
+
+    def get_first_labeled_block_number(self) -> int:
+        """
+        Returns the first block number that has been labeled.
+        """
+        first_block = (
+            self.db_session.query(EthereumLabel)
+            .filter(EthereumLabel.label == self.label)
+            .order_by(EthereumLabel.block_number)
+            .first()
+        )
+        if first_block is None:
+            return 0
+        return first_block.block_number
+
+    def get_last_block_number(self) -> int:
+        """
+        Returns the last block number that has been processed.
+        """
+        last_block = (
+            self.db_session.query(EthereumBlock)
+            .order_by(EthereumBlock.block_number.desc())
+            .first()
+        )
+        if last_block is None:
+            return 0
+        return last_block.block_number
+
+    def get_first_block_number(self) -> int:
+        """
+        Returns the first block number that has been processed.
+        """
+        first_block = (
+            self.db_session.query(EthereumBlock)
+            .order_by(EthereumBlock.block_number)
+            .first()
+        )
+        if first_block is None:
+            return 0
+        return first_block.block_number
+
+    def get_raw_contract_deployment_transactions(
+        self, from_block: int, to_block: int
+    ) -> List[RawDeploymentTx]:
+        """
+        Returns a list of raw contract deployment transactions.
+        """
+        result = (
+            self.db_session.query(
+                EthereumTransaction.hash,
+                EthereumTransaction.gas_price,
+                EthereumBlock.timestamp,
+            )
+            .join(
+                EthereumBlock,
+                EthereumTransaction.block_number == EthereumBlock.block_number,
+            )
+            .filter(EthereumBlock.block_number >= from_block)
+            .filter(EthereumBlock.block_number <= to_block)
+            .filter(EthereumTransaction.to_address == None)
+            .all()
+        )
+        return [
+            RawDeploymentTx(
+                transaction_hash=row[0],
+                gas_price=row[1],
+                timestamp=row[2],
+            )
+            for row in result
+        ]
+
+    def save_contract_deployment_labels(
+        self, contract_deployment_list: List[ContractDeployment]
+    ) -> None:
+        """
+        Saves a list of contract deployment labels.
+        """
+        transaction_hashes = [
+            contract_deployment.transaction_hash
+            for contract_deployment in contract_deployment_list
+        ]
+        existing_labels = (
+            self.db_session.query(EthereumLabel.transaction_hash)
+            .filter(EthereumLabel.label == self.label)
+            .filter(EthereumLabel.transaction_hash.in_(transaction_hashes))
+            .all()
+        )
+        existing_labels_tx_hashes = [
+            label_tx_hash[0] for label_tx_hash in existing_labels
+        ]
+        new_labels = [
+            EthereumLabel(
+                transaction_hash=contract_deployment.transaction_hash,
+                block_number=contract_deployment.block_number,
+                block_timestamp=contract_deployment.block_timestamp,
+                label=self.label,
+                address=contract_deployment.address,
+                label_data={
+                    "deployer": contract_deployment.deployer_address,
+                    "gasUsed": int(contract_deployment.gas_used),
+                    "gasPrice": int(contract_deployment.gas_price),
+                    "transactionFee": int(contract_deployment.transaction_fee),
+                },
+            )
+            for contract_deployment in contract_deployment_list
+            if contract_deployment.transaction_hash not in existing_labels_tx_hashes
+        ]
+        if not new_labels:
+            return
+        try:
+            logger.info(f"Saving {len(new_labels)} new contract deployment labels.")
+            self.db_session.add_all(new_labels)
+            self.db_session.commit()
+        except Exception as e:
+            logger.error(f"Error saving contract deployment labels: {e}")
+            self.db_session.rollback()
+
+
+def get_transaction_receipt(web3: Web3, tx_hash: str) -> TxReceipt:
+    """
+    Returns the transaction receipt for the given transaction hash.
+    """
+
+    return web3.eth.get_transaction_receipt(cast(HexBytes, tx_hash))
+
+
+def get_contract_deployment_transactions(
+    web3: Web3,
+    datastore: MoonstreamDataStore,
+    from_block: int,
+    to_block: int,
+) -> List[ContractDeployment]:
+    """
+    Returns a list of ContractDeployment objects for all contract deployment transactions in the given block range.
+    """
+    logger.info(
+        f"Getting contract deployment transactions from {from_block} to {to_block}"
+    )
+    contract_deployment_transactions = []
+    for raw_deployment_tx in datastore.get_raw_contract_deployment_transactions(
+        from_block, to_block
+    ):
+        receipt = get_transaction_receipt(web3, raw_deployment_tx.transaction_hash)
+        if receipt is None:
+            continue
+
+        contract_deployment_transactions.append(
+            ContractDeployment(
+                address=cast(str, receipt["contractAddress"]),
+                block_number=receipt["blockNumber"],
+                transaction_hash=receipt["transactionHash"].hex(),
+                deployer_address=receipt["from"],
+                block_timestamp=raw_deployment_tx.timestamp,
+                gas_used=receipt["gasUsed"],
+                gas_price=raw_deployment_tx.gas_price,
+                transaction_fee=receipt["gasUsed"] * raw_deployment_tx.gas_price,
+            )
+        )
+    return contract_deployment_transactions
+
+
+class ContractDeploymentCrawler:
+    """
+    Crawls contract deployments from MoonstreamDB transactions, using web3
+    to fetch transaction receipts
+    """
+
+    def __init__(self, web3: Web3, datastore: MoonstreamDataStore):
+        self.web3 = web3
+        self.datastore = datastore
+
+    def crawl(
+        self, from_block: Optional[int], to_block: Optional[int], batch_size: int = 200
+    ) -> None:
+        """
+        Crawls contract deployments in batches with the given batch size.
+        If from_block is None, the first block from the datastore is used as the start.
+        If to_block is None, the latest block from the datastore is used.
+        """
+        if from_block is None:
+            from_block = self.datastore.get_first_block_number()
+        if to_block is None:
+            to_block = self.datastore.get_last_block_number()
+
+        # Process the block range in fixed-size batches to bound per-iteration work
+        for batch_from_block, batch_to_block in self.get_batch_block_range(
+            from_block, to_block, batch_size
+        ):
+            contract_deployment_transactions = get_contract_deployment_transactions(
+                self.web3, self.datastore, batch_from_block, batch_to_block
+            )
+            self.datastore.save_contract_deployment_labels(
+                contract_deployment_transactions
+            )
+
+    # Splits the inclusive block range into consecutive batches of at most batch_size blocks
+    def get_batch_block_range(
+        self, from_block: int, to_block: int, batch_size: int
+    ) -> List[Tuple[int, int]]:
+        """
+        Returns a list of block ranges with the given batch size
+        """
+        if from_block > to_block:
+            raise ValueError("from_block must be less than or equal to to_block")
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")
+
+        block_ranges = []
+        current_from_block = from_block
+        current_to_block = min(current_from_block + batch_size - 1, to_block)
+        while current_from_block <= to_block:
+            block_ranges.append((current_from_block, current_to_block))
+            current_from_block = current_to_block + 1
+            current_to_block = min(current_from_block + batch_size - 1, to_block)
+        return block_ranges
diff --git a/crawlers/mooncrawl/requirements.txt b/crawlers/mooncrawl/requirements.txt
index 529b1e94..82fb4aa6 100644
--- a/crawlers/mooncrawl/requirements.txt
+++ b/crawlers/mooncrawl/requirements.txt
@@ -12,7 +12,7 @@ chardet==4.0.0
 charset-normalizer==2.0.4
 click==8.0.1
 cytoolz==0.11.0
--e git+https://git@github.com/bugout-dev/moonstream.git@0a771ddfbca1254be331149ccf2d162aa09b7bc0#egg=moonstreamdb&subdirectory=db
+-e git+https://git@github.com/bugout-dev/moonstream.git@67fe019f1086c435dd3b58f1ade2778acc2167c7#egg=moonstreamdb&subdirectory=db
 eth-abi==2.1.1
 eth-account==0.5.5
 eth-hash==0.3.2
diff --git a/crawlers/mooncrawl/setup.py b/crawlers/mooncrawl/setup.py
index bc387784..3f8d30e1 100644
--- a/crawlers/mooncrawl/setup.py
+++ b/crawlers/mooncrawl/setup.py
@@ -51,6 +51,7 @@ setup(
"identity=mooncrawl.identity:main", "etherscan=mooncrawl.etherscan:main", "nft=mooncrawl.nft.cli:main", + "deploycrawler=mooncrawl.contract.cli:main", ] }, )