import argparse
import json
import logging
import random
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
from urllib.error import HTTPError

from moonstreamdb.blockchain import AvailableBlockchainType

from ..db import yield_db_preping_session_ctx, yield_db_read_only_preping_session_ctx
from .db import (
    clean_labels_from_db,
    get_current_metadata_for_address,
    get_tokens_id_wich_may_updated,
    get_uris_of_tokens,
    metadata_to_label,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

batch_size = 50


def leak_of_crawled_uri(
    ids: List[Optional[str]], leak_rate: float, maybe_updated: List[Optional[str]]
) -> List[Optional[str]]:
    """
    Return the subset of already crawled ids to keep as parsed.

    Ids in maybe_updated are always leaked (recrawled); the remaining ids
    are each leaked with probability leak_rate.
    """
    assert 0 <= leak_rate <= 1, "Leak rate must be between 0 and 1"

    result = []

    for token_id in ids:
        if token_id not in maybe_updated and random.random() > leak_rate:
            result.append(token_id)

    return result


def crawl_uri(metadata_uri: str) -> Any:
    """
    Get metadata from URI with up to 3 retries.
    """
    retry = 0
    result = None
    while retry < 3:
        try:
            response = urllib.request.urlopen(metadata_uri, timeout=10)

            if (
                metadata_uri.startswith("data:application/json")
                or response.status == 200
            ):
                result = json.loads(response.read())
                break
            retry += 1

        except HTTPError as error:
            logger.error(f"Request failed with status code: {error.code}")
            retry += 1
            continue
        except Exception as err:
            logger.error(err)
            logger.error(f"Request failed for url: {metadata_uri}")
            retry += 1
            continue

    return result


def parse_metadata(
    blockchain_type: AvailableBlockchainType, batch_size: int, max_recrawl: int
):
    """
    Parse all metadata of tokens.
    """

    logger.info("Starting metadata crawler")
    logger.info(f"Processing blockchain {blockchain_type.value}")

    # Collect all token URIs from the database, grouped by contract address.
    with yield_db_read_only_preping_session_ctx() as db_session_read_only:
        try:
            logger.info("Requesting all tokens with uri from database")
            uris_of_tokens = get_uris_of_tokens(db_session_read_only, blockchain_type)

            tokens_uri_by_address: Dict[str, Any] = {}

            for token_uri_data in uris_of_tokens:
                if token_uri_data.address not in tokens_uri_by_address:
                    tokens_uri_by_address[token_uri_data.address] = []
                tokens_uri_by_address[token_uri_data.address].append(token_uri_data)

        except Exception as err:
            logger.error(f"Error while requesting tokens with uri from database: {err}")
            return

    for address in tokens_uri_by_address:
        with yield_db_read_only_preping_session_ctx() as db_session_read_only:
            try:
                already_parsed = get_current_metadata_for_address(
                    db_session=db_session_read_only,
                    blockchain_type=blockchain_type,
                    address=address,
                )

                maybe_updated = get_tokens_id_wich_may_updated(
                    db_session=db_session_read_only,
                    blockchain_type=blockchain_type,
                    address=address,
                )
            except Exception as err:
                logger.warning(err)
                logger.warning(
                    f"Error while requesting metadata for address: {address}"
                )
                continue

        with yield_db_preping_session_ctx() as db_session:
            try:
                logger.info(f"Starting to crawl metadata for address: {address}")

                # Pick a leak rate so that the total number of recrawled
                # tokens stays close to the max_recrawl budget.
                leak_rate = 0.0

                if len(maybe_updated) > 0:
                    free_spots = len(maybe_updated) / max_recrawl

                    if free_spots > 1:
                        leak_rate = 0
                    else:
                        leak_rate = 1 - (
                            len(already_parsed) - max_recrawl + len(maybe_updated)
                        ) / len(already_parsed)

                parsed_with_leak = leak_of_crawled_uri(
                    already_parsed, leak_rate, maybe_updated
                )

                logger.info(
                    f"Leak rate: {leak_rate} for {address} with maybe updated {len(maybe_updated)}"
                )

                logger.info(f"Already parsed: {len(already_parsed)} for {address}")

                logger.info(
                    f"Tokens in database: {len(tokens_uri_by_address[address])} for {address}"
                )
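                # How the leak-rate formula plays out (illustrative numbers,
                # not from the source): with len(already_parsed) = 1000,
                # max_recrawl = 300 and len(maybe_updated) = 100,
                # leak_rate = 1 - (1000 - 300 + 100) / 1000 = 0.2.
                # leak_of_crawled_uri then always drops the 100 maybe-updated
                # ids and drops each of the other 900 with probability 0.2,
                # so roughly 100 + 0.2 * 900 = 280 tokens get recrawled,
                # staying within the max_recrawl budget of 300.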
                logger.info(
                    f"Amount of tokens parsed with leak: {len(parsed_with_leak)} for {address}"
                )

                # Remove already parsed tokens
                new_tokens_uri_by_address = [
                    token_uri_data
                    for token_uri_data in tokens_uri_by_address[address]
                    if token_uri_data.token_id not in parsed_with_leak
                ]

                logger.info(
                    f"Amount of tokens to parse: {len(new_tokens_uri_by_address)} for {address}"
                )

                for requests_chunk in [
                    new_tokens_uri_by_address[i : i + batch_size]
                    for i in range(0, len(new_tokens_uri_by_address), batch_size)
                ]:
                    written_labels = 0
                    # Close any transaction left open on the session before
                    # explicitly beginning a new one.
                    db_session.commit()

                    try:
                        with db_session.begin():
                            for token_uri_data in requests_chunk:
                                # Run each request in its own worker so a
                                # hanging request can be abandoned after the
                                # timeout instead of blocking the whole chunk.
                                with ThreadPoolExecutor(max_workers=1) as executor:
                                    future = executor.submit(
                                        crawl_uri, token_uri_data.token_uri
                                    )
                                    metadata = future.result(timeout=10)
                                db_session.add(
                                    metadata_to_label(
                                        blockchain_type=blockchain_type,
                                        metadata=metadata,
                                        token_uri_data=token_uri_data,
                                    )
                                )
                                written_labels += 1

                            if written_labels > 0:
                                clean_labels_from_db(
                                    db_session=db_session,
                                    blockchain_type=blockchain_type,
                                    address=address,
                                )
                                logger.info(
                                    f"Wrote {written_labels} labels for {address}"
                                )
                        # Transaction is committed here.
                    except Exception as err:
                        logger.warning(err)
                        logger.warning(
                            f"Error while writing labels for address: {address}"
                        )
                        db_session.rollback()
                        clean_labels_from_db(
                            db_session=db_session,
                            blockchain_type=blockchain_type,
                            address=address,
                        )

            except Exception as err:
                logger.warning(err)
                logger.warning(f"Error while crawling metadata for address: {address}")
                db_session.rollback()
                continue


def handle_crawl(args: argparse.Namespace) -> None:
    """
    Parse all metadata of tokens.
    """

    blockchain_type = AvailableBlockchainType(args.blockchain)

    parse_metadata(blockchain_type, args.commit_batch_size, args.max_recrawl)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.set_defaults(func=lambda _: parser.print_help())

    subparsers = parser.add_subparsers()

    metadata_crawler_parser = subparsers.add_parser(
        "crawl",
        help="Crawler of tokens metadata.",
    )
    metadata_crawler_parser.add_argument(
        "--blockchain",
        "-b",
        type=str,
        help="Type of blockchain which is written in database",
        required=True,
    )
    metadata_crawler_parser.add_argument(
        "--commit-batch-size",
        "-c",
        type=int,
        default=50,
        help="Amount of requests before committing to database",
    )
    metadata_crawler_parser.add_argument(
        "--max-recrawl",
        "-m",
        type=int,
        default=300,
        help="Maximum amount of recrawling of already crawled tokens",
    )
    metadata_crawler_parser.set_defaults(func=handle_crawl)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
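# Example invocation (the module path below is an assumption; adjust it to
# wherever this file lives in the package, and note that --blockchain must be
# a value accepted by AvailableBlockchainType):
#
#   python -m mooncrawl.metadata_crawler.cli crawl \
#       --blockchain polygon \
#       --commit-batch-size 50 \
#       --max-recrawl 300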