moonstream/crawlers/mooncrawl/mooncrawl/moonworm_crawler/cli.py
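"""Moonworm crawler CLI.

Defines two subcommands:

* ``crawl`` - continuously crawl event and function call jobs from the
  Moonstream moonworm tasks journal.
* ``historical-crawl`` - crawl a historical block range in reverse
  (from ``--start`` down to ``--end``).

A rough invocation sketch (the module path is inferred from the file
location, and the blockchain type value is only an example; it must be a
member of ``AvailableBlockchainType``):

    python -m mooncrawl.moonworm_crawler.cli crawl -b polygon --web3 https://my-node.example
"""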

import argparse
import logging
from typing import Optional
from uuid import UUID

from moonstreamdb.blockchain import AvailableBlockchainType
from web3 import Web3
from web3.middleware import geth_poa_middleware

from ..db import yield_db_session_ctx
from ..settings import (
    HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES,
    HISTORICAL_CRAWLER_STATUSES,
    MOONSTREAM_MOONWORM_TASKS_JOURNAL,
)
from .continuous_crawler import _retry_connect_web3, continuous_crawler
from .crawler import (
    SubscriptionTypes,
    blockchain_type_to_subscription_type,
    find_all_deployed_blocks,
    get_crawl_job_entries,
    make_event_crawl_jobs,
    make_function_call_crawl_jobs,
    moonworm_crawler_update_job_as_pickedup,
    update_job_state_with_filters,
)
from .db import get_first_labeled_block_number, get_last_labeled_block_number
from .historical_crawler import historical_crawler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def handle_crawl(args: argparse.Namespace) -> None:
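    """Handler for the ``crawl`` subcommand.

    Loads event and function call crawl jobs from the moonworm tasks journal,
    marks them as picked up, connects to a web3 provider, resolves the start
    block, and hands off to ``continuous_crawler``.
    """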
    blockchain_type = AvailableBlockchainType(args.blockchain_type)
    subscription_type = blockchain_type_to_subscription_type(blockchain_type)

    initial_event_jobs = make_event_crawl_jobs(
        get_crawl_job_entries(
            subscription_type,
            "event",
            MOONSTREAM_MOONWORM_TASKS_JOURNAL,
        )
    )
    logger.info(f"Initial event crawl jobs count: {len(initial_event_jobs)}")

    initial_function_call_jobs = make_function_call_crawl_jobs(
        get_crawl_job_entries(
            subscription_type,
            "function",
            MOONSTREAM_MOONWORM_TASKS_JOURNAL,
        )
    )
    logger.info(
        f"Initial function call crawl jobs count: {len(initial_function_call_jobs)}"
    )

    (
        initial_event_jobs,
        initial_function_call_jobs,
    ) = moonworm_crawler_update_job_as_pickedup(
        event_crawl_jobs=initial_event_jobs,
        function_call_crawl_jobs=initial_function_call_jobs,
    )

    logger.info(f"Blockchain type: {blockchain_type.value}")

    with yield_db_session_ctx() as db_session:
        web3: Optional[Web3] = None
        if args.web3 is None:
            logger.info(
                "No web3 provider URL provided, using default (blockchain.py: connect())"
            )
            web3 = _retry_connect_web3(blockchain_type, web3_uri=args.web3_uri)
        else:
            logger.info(f"Using web3 provider URL: {args.web3}")
            web3 = Web3(
                Web3.HTTPProvider(
                    args.web3,
                )
            )
            if args.poa:
                logger.info("Using PoA middleware")
                web3.middleware_onion.inject(geth_poa_middleware, layer=0)

        last_labeled_block = get_last_labeled_block_number(db_session, blockchain_type)
        logger.info(f"Last labeled block: {last_labeled_block}")

        start_block = args.start
        if start_block is None:
            logger.info("No start block provided")
            if last_labeled_block is not None:
                start_block = last_labeled_block - 1
                logger.info(f"Using last labeled block as start: {start_block}")
            else:
                logger.info(
                    "No last labeled block found, using start block (web3.eth.blockNumber - 10000)"
                )
                start_block = web3.eth.blockNumber - 10000
                logger.info(f"Starting from block: {start_block}")
        elif last_labeled_block is not None:
            if start_block < last_labeled_block and not args.force:
                logger.info(
                    f"Start block is less than last labeled block, using last labeled block: {last_labeled_block}"
                )
                logger.info(
                    f"Use --force to override this and start from the start block: {start_block}"
                )
                start_block = last_labeled_block
            else:
                logger.info(f"Using start block: {start_block}")
        else:
            logger.info(f"Using start block: {start_block}")

        confirmations = args.confirmations
        if not args.no_confirmations:
            assert confirmations > 0, "confirmations must be greater than 0"
        else:
            confirmations = 0
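        # Run the continuous crawl loop with the resolved start block,
        # batch sizes, confirmations, and heartbeat/refetch intervals.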
        continuous_crawler(
            db_session,
            blockchain_type,
            web3,
            initial_event_jobs,
            initial_function_call_jobs,
            start_block,
            args.max_blocks_batch,
            args.min_blocks_batch,
            confirmations,
            args.min_sleep_time,
            args.heartbeat_interval,
            args.new_jobs_refetch_interval,
            web3_uri=args.web3_uri,
        )


def handle_historical_crawl(args: argparse.Namespace) -> None:
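    """Handler for the ``historical-crawl`` subcommand.

    Builds filtered event and function call crawl jobs (optionally restricted
    to a single address or driven by the tasks journal), resolves the start
    and end blocks, and hands off to ``historical_crawler``, which crawls in
    the reverse direction (from higher to lower block numbers).
    """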
    blockchain_type = AvailableBlockchainType(args.blockchain_type)
    subscription_type = blockchain_type_to_subscription_type(blockchain_type)

    extend_tags = []
    addresses_filter = []
    if args.address is not None:
        addresses_filter = [Web3.toChecksumAddress(args.address)]

    if args.tasks_journal:
        addresses_filter = []
        extend_tags.extend(
            [
                "#moonworm_task_pickedup:True",
                f"!#{HISTORICAL_CRAWLER_STATUS_TAG_PREFIXES['historical_crawl_status']}:{HISTORICAL_CRAWLER_STATUSES['finished']}",
            ]
        )

    all_event_jobs = make_event_crawl_jobs(
        get_crawl_job_entries(
            subscription_type,
            "event",
            MOONSTREAM_MOONWORM_TASKS_JOURNAL,
            extend_tags=extend_tags,
        )
    )

    filtered_event_jobs = []
    for job in all_event_jobs:
        if addresses_filter and not args.tasks_journal:
            intersection = [
                address for address in job.contracts if address in addresses_filter
            ]
        else:
            intersection = job.contracts

        if intersection:
            job.contracts = intersection
            filtered_event_jobs.append(job)

    logger.info(f"Filtered event crawl jobs count: {len(filtered_event_jobs)}")

    all_function_call_jobs = make_function_call_crawl_jobs(
        get_crawl_job_entries(
            subscription_type,
            "function",
            MOONSTREAM_MOONWORM_TASKS_JOURNAL,
            extend_tags=extend_tags,
        )
    )

    if addresses_filter:
        filtered_function_call_jobs = [
            job
            for job in all_function_call_jobs
            if job.contract_address in addresses_filter
        ]
    else:
        filtered_function_call_jobs = all_function_call_jobs

    if args.only_events:
        filtered_function_call_jobs = []
        logger.info("Removing function call crawl jobs since --only-events is set")

    if args.only_functions:
        filtered_event_jobs = []
        logger.info(
            f"Removing event crawl jobs since --only-functions is set. Function call jobs count: {len(filtered_function_call_jobs)}"
        )

    if args.only_events and args.only_functions:
        raise ValueError(
            "--only-events and --only-functions cannot be set at the same time"
        )
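    # When driven by the tasks journal, move matching pending jobs to the
    # in_progress historical crawl status before starting.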
    if args.tasks_journal:
        if len(filtered_event_jobs) > 0:
            filtered_event_jobs = update_job_state_with_filters(  # type: ignore
                events=filtered_event_jobs,
                address_filter=[],
                required_tags=[
                    "historical_crawl_status:pending",
                    "moonworm_task_pickedup:True",
                ],
                tags_to_add=["historical_crawl_status:in_progress"],
                tags_to_delete=["historical_crawl_status:pending"],
            )

        if len(filtered_function_call_jobs) > 0:
            filtered_function_call_jobs = update_job_state_with_filters(  # type: ignore
                events=filtered_function_call_jobs,
                address_filter=[],
                required_tags=[
                    "historical_crawl_status:pending",
                    "moonworm_task_pickedup:True",
                ],
                tags_to_add=["historical_crawl_status:in_progress"],
                tags_to_delete=["historical_crawl_status:pending"],
            )

    logger.info(
        f"Initial function call crawl jobs count: {len(filtered_function_call_jobs)}"
    )
    logger.info(f"Blockchain type: {blockchain_type.value}")

    with yield_db_session_ctx() as db_session:
        web3: Optional[Web3] = None
        if args.web3 is None:
            logger.info(
                "No web3 provider URL provided, using default (blockchain.py: connect())"
            )
            web3 = _retry_connect_web3(blockchain_type, web3_uri=args.web3_uri)
        else:
            logger.info(f"Using web3 provider URL: {args.web3}")
            web3 = Web3(
                Web3.HTTPProvider(
                    args.web3,
                )
            )
            if args.poa:
                logger.info("Using PoA middleware")
                web3.middleware_onion.inject(geth_poa_middleware, layer=0)

        last_labeled_block = get_first_labeled_block_number(
            db_session, blockchain_type, args.address, only_events=args.only_events
        )
        logger.info(f"Last labeled block: {last_labeled_block}")
        addresses_deployment_blocks = None

        end_block = args.end
        start_block = args.start

        # get set of addresses from event jobs and function call jobs
        if args.find_deployed_blocks:
            addresses_set = set()
            for job in filtered_event_jobs:
                addresses_set.update(job.contracts)
            for function_job in filtered_function_call_jobs:
                addresses_set.add(function_job.contract_address)

            if args.start is None:
                start_block = web3.eth.blockNumber - 1

            addresses_deployment_blocks = find_all_deployed_blocks(
                web3, list(addresses_set)
            )
            if len(addresses_deployment_blocks) == 0:
                logger.error(
                    "No addresses found in the blockchain. Please check your addresses and try again"
                )
                return

            end_block = min(addresses_deployment_blocks.values())

        if start_block is None:
            logger.info("No start block provided")
            if last_labeled_block is not None:
                start_block = last_labeled_block
                logger.info(f"Using last labeled block as start: {start_block}")
            else:
                raise ValueError(
                    "No start block provided and no last labeled block found"
                )
        elif last_labeled_block is not None:
            if start_block > last_labeled_block and not args.force:
                logger.info(
                    f"Start block is greater than last labeled block, using last labeled block: {last_labeled_block}"
                )
                logger.info(
                    f"Use --force to override this and start from the start block: {start_block}"
                )
                start_block = last_labeled_block
            else:
                logger.info(f"Using start block: {start_block}")
        else:
            logger.info(f"Using start block: {start_block}")

        # Guard against comparing with a missing end block (neither --end nor
        # --find-deployed-blocks provided).
        if end_block is None:
            raise ValueError(
                "No end block provided, use --end or --find-deployed-blocks"
            )

        if start_block < end_block:
            raise ValueError(
                f"Start block {start_block} is less than end block {end_block}. This crawler crawls in the reverse direction."
            )
        historical_crawler(
            db_session,
            blockchain_type,
            web3,
            filtered_event_jobs,
            filtered_function_call_jobs,
            start_block,
            end_block,
            args.max_blocks_batch,
            args.min_sleep_time,
            web3_uri=args.web3_uri,
            addresses_deployment_blocks=addresses_deployment_blocks,
        )


def main() -> None:
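    """Build the argument parser for the ``crawl`` and ``historical-crawl``
    subcommands and dispatch to the selected handler."""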
    parser = argparse.ArgumentParser()
    parser.set_defaults(func=lambda _: parser.print_help())
    parser.add_argument(
        "--web3-uri",
        help="Node JSON RPC uri",
    )
    subparsers = parser.add_subparsers()

    crawl_parser = subparsers.add_parser(
        "crawl",
        help="Continuously crawl the event/function call jobs from the Bugout journal",
    )
    crawl_parser.add_argument(
        "--start",
        "-s",
        type=int,
        default=None,
    )
    crawl_parser.add_argument(
        "--blockchain-type",
        "-b",
        type=str,
        help=f"Available blockchain types: {[member.value for member in AvailableBlockchainType]}",
    )
    crawl_parser.add_argument(
        "--web3",
        type=str,
        default=None,
        help="Web3 provider URL",
    )
    crawl_parser.add_argument(
        "--poa",
        action="store_true",
        default=False,
        help="Use PoA middleware",
    )
    crawl_parser.add_argument(
        "--max-blocks-batch",
        "-m",
        type=int,
        default=80,
        help="Maximum number of blocks to crawl in a single batch",
    )
    crawl_parser.add_argument(
        "--min-blocks-batch",
        "-n",
        type=int,
        default=20,
        help="Minimum number of blocks to crawl in a single batch",
    )
    crawl_parser.add_argument(
        "--confirmations",
        "-c",
        type=int,
        default=175,
        help="Number of confirmations to wait for",
    )
    crawl_parser.add_argument(
        "--no-confirmations",
        action="store_true",
        default=False,
        help="Do not wait for confirmations; explicitly set confirmations to 0",
    )
    crawl_parser.add_argument(
        "--min-sleep-time",
        "-t",
        type=float,
        default=0.1,
        help="Minimum time to sleep between crawl steps",
    )
    crawl_parser.add_argument(
        "--heartbeat-interval",
        "-i",
        type=float,
        default=60,
        help="Heartbeat interval in seconds",
    )
    crawl_parser.add_argument(
        "--new-jobs-refetch-interval",
        "-r",
        type=float,
        default=180,
        help="Time to wait before refetching new jobs",
    )
    crawl_parser.add_argument(
        "--force",
        action="store_true",
        default=False,
        help="Force start from the start block",
    )
    crawl_parser.set_defaults(func=handle_crawl)

    historical_crawl_parser = subparsers.add_parser(
        "historical-crawl", help="Crawl historical data"
    )
    historical_crawl_parser.add_argument(
        "--address",
        "-a",
        required=False,
        type=str,
    )
    historical_crawl_parser.add_argument(
        "--start",
        "-s",
        type=int,
        default=None,
    )
    historical_crawl_parser.add_argument(
        "--end",
        "-e",
        type=int,
        required=False,
    )
    historical_crawl_parser.add_argument(
        "--blockchain-type",
        "-b",
        type=str,
        help=f"Available blockchain types: {[member.value for member in AvailableBlockchainType]}",
    )
    historical_crawl_parser.add_argument(
        "--web3",
        type=str,
        default=None,
        help="Web3 provider URL",
    )
    historical_crawl_parser.add_argument(
        "--poa",
        action="store_true",
        default=False,
        help="Use PoA middleware",
    )
    historical_crawl_parser.add_argument(
        "--max-blocks-batch",
        "-m",
        type=int,
        default=80,
        help="Maximum number of blocks to crawl in a single batch",
    )
    historical_crawl_parser.add_argument(
        "--min-sleep-time",
        "-t",
        type=float,
        default=0.1,
        help="Minimum time to sleep between crawl steps",
    )
    historical_crawl_parser.add_argument(
        "--force",
        action="store_true",
        default=False,
        help="Force start from the start block",
    )
    historical_crawl_parser.add_argument(
        "--only-events",
        action="store_true",
        default=False,
        help="Only crawl events",
    )
    historical_crawl_parser.add_argument(
        "--only-functions",
        action="store_true",
        default=False,
        help="Only crawl function calls",
    )
    historical_crawl_parser.add_argument(
        "--find-deployed-blocks",
        action="store_true",
        default=False,
        help="Find the deployment blocks for the crawled addresses",
    )
    historical_crawl_parser.add_argument(
        "--tasks-journal",
        action="store_true",
        default=False,
        help="Use the tasks journal, which fills all required fields for the historical crawl",
    )
    historical_crawl_parser.set_defaults(func=handle_historical_crawl)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()