Extend the leak logic to fetch all tokens that have tx_calls (other than transfers) after the latest metadata crawl.

The leak rate is now calculated in code; it depends on the number of possibly updated tokens and on the --max-recrawl CLI argument.
pull/752/head
Andrey 2023-01-31 14:09:03 +02:00
rodzic 69cfe4069b
commit f9c6d29561
2 zmienionych plików z 99 dodań i 12 usunięć

Wyświetl plik

@ -17,6 +17,7 @@ from .db import (
commit_session,
get_uris_of_tokens,
get_current_metadata_for_address,
get_tokens_wich_maybe_updated,
metadata_to_label,
)
from ..settings import (
@ -31,11 +32,12 @@ batch_size = 50
def leak_of_crawled_uri(
    ids: List[Optional[str]],
    leak_rate: float,
    maybe_updated: List[Optional[str]],
) -> List[Optional[str]]:
    """
    Filter the already-crawled token ids, randomly dropping ("leaking") some
    of the ids that are listed in maybe_updated.

    An id that is not in maybe_updated is always kept. An id that is in
    maybe_updated is kept only when a fresh random draw from [0, 1) is
    strictly greater than leak_rate, so leak_rate == 1 drops every such id.

    Args:
        ids: token ids to filter.
        leak_rate: probability threshold in [0, 1] applied to ids found in
            maybe_updated.
        maybe_updated: token ids that are candidates for being dropped.

    Returns:
        The ids that were kept, in their original order.

    Raises:
        AssertionError: if leak_rate is outside [0, 1].
    """
    assert 0 <= leak_rate <= 1, "Leak rate must be between 0 and 1"

    # Build the lookup once: set membership is O(1) per id, whereas scanning
    # the list for every id would be O(len(ids) * len(maybe_updated)).
    maybe_updated_ids = set(maybe_updated)

    return [
        token_id
        for token_id in ids
        if token_id not in maybe_updated_ids or random.random() > leak_rate
    ]
def crawl_uri(metadata_uri: str) -> Any:
@ -68,7 +70,7 @@ def crawl_uri(metadata_uri: str) -> Any:
def parse_metadata(
blockchain_type: AvailableBlockchainType, batch_size: int, leak_rate: float
blockchain_type: AvailableBlockchainType, batch_size: int, max_recrawl: int
):
"""
@ -105,7 +107,20 @@ def parse_metadata(
db_session=db_session, blockchain_type=blockchain_type, address=address
)
parsed_with_leak = leak_of_crawled_uri(already_parsed, leak_rate)
maybe_updated = get_tokens_wich_maybe_updated(
db_session=db_session, blockchain_type=blockchain_type, address=address
)
leak_rate = 0.0
if len(maybe_updated) > 0:
leak_rate = max_recrawl / len(maybe_updated)
if leak_rate > 1:
leak_rate = 1
parsed_with_leak = leak_of_crawled_uri(
already_parsed, leak_rate, maybe_updated
)
for requests_chunk in [
tokens_uri_by_address[address][i : i + batch_size]
@ -140,7 +155,7 @@ def handle_crawl(args: argparse.Namespace) -> None:
blockchain_type = AvailableBlockchainType(args.blockchain)
parse_metadata(blockchain_type, args.commit_batch_size, args.leak_rate)
parse_metadata(blockchain_type, args.commit_batch_size, args.max_recrawl)
def main() -> None:
@ -168,11 +183,11 @@ def main() -> None:
help="Amount of requests before commiting to database",
)
metadata_crawler_parser.add_argument(
"--leak-rate",
"-l",
type=float,
default=0.01,
help="Leak rate of already crawled tokens",
"--max-recrawl",
"-m",
type=int,
default=200,
help="Maximum amount of recrawling of already crawled tokens",
)
metadata_crawler_parser.set_defaults(func=handle_crawl)

Wyświetl plik

@ -6,7 +6,7 @@ from moonstreamdb.blockchain import AvailableBlockchainType, get_label_model
from sqlalchemy.orm import Session
from ..data import TokenURIs
from ..settings import VIEW_STATE_CRAWLER_LABEL, METADATA_CRAWLER_LABEL
from ..settings import VIEW_STATE_CRAWLER_LABEL, METADATA_CRAWLER_LABEL, CRAWLER_LABEL
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@ -139,3 +139,75 @@ def get_current_metadata_for_address(
result = [data[0] for data in current_metadata]
return result
def get_tokens_wich_maybe_updated(
    db_session: Session, blockchain_type: AvailableBlockchainType, address: str
):
    """
    Return the token ids of the given contract address whose metadata may be
    stale.

    The query compares two per-token snapshots taken from the label table of
    the given blockchain:

    - token_id_latest_events: for every (tokenId, method name) pair, the most
      recent successful tx_call (status '1') labelled with CRAWLER_LABEL,
      ignoring safeTransferFrom, approve and transferFrom calls.
    - metadata_state: for every token_id, the most recent row labelled with
      METADATA_CRAWLER_LABEL.

    A token id is returned when one of its latest events has a
    block_timestamp strictly greater than the block_timestamp of its latest
    metadata row.

    Returns:
        A list with the first column (token id) of every result row.
    """
    label_model = get_label_model(blockchain_type)

    table = label_model.__tablename__

    # The table name cannot be passed as a bind parameter, so it is
    # interpolated into the statement; it comes from the ORM model, not from
    # user input. All values are passed as bind parameters.
    tokens = db_session.execute(
        """
        with token_id_latest_events as (
            SELECT
                DISTINCT ON (
                    label_data -> 'args' ->> 'tokenId',
                    label_data ->> 'name'
                ) label_data -> 'args' ->> 'tokenId' as token_id,
                label_data ->> 'name' as name,
                block_timestamp
            FROM
                {}
            where
                label = :moonworm_label
                and address = :address
                and label_data->> 'type' = 'tx_call'
                and label_data->>'status' = '1'
                and label_data ->> 'name' not in (
                    'safeTransferFrom',
                    'approve',
                    'transferFrom'
                )
            ORDER BY
                (label_data -> 'args' ->> 'tokenId') ASC,
                (label_data ->> 'name') ASC,
                block_timestamp :: INT DESC,
                log_index :: INT DESC
        ),
        metadata_state as (
            SELECT
                DISTINCT ON(label_data ->> 'token_id') label_data ->> 'token_id' as token_id,
                block_timestamp
            FROM
                {}
            WHERE
                address = :address
                AND label = :metadata_label
            ORDER BY
                label_data ->> 'token_id' ASC,
                block_number :: INT DESC
        )
        SELECT
            distinct token_id_latest_events.token_id
        FROM
            token_id_latest_events
            JOIN metadata_state ON token_id_latest_events.token_id = metadata_state.token_id
        WHERE
            token_id_latest_events.block_timestamp > metadata_state.block_timestamp
        """.format(
            table, table
        ),
        {
            # ":address" is referenced by both CTEs and must be supplied,
            # otherwise the statement cannot be executed.
            "address": address,
            "metadata_label": METADATA_CRAWLER_LABEL,
            "moonworm_label": CRAWLER_LABEL,
        },
    )

    result = [data[0] for data in tokens]

    return result