Add update logic.

state-crawler-improvments
Andrey 2022-09-14 19:49:07 +03:00
rodzic 9bb5fb4794
commit c11ddd51ce
3 zmienionych plików z 119 dodań i 70 usunięć

Wyświetl plik

@ -54,6 +54,6 @@ class QueryDataUpdate(BaseModel):
class TokenURIs(BaseModel):
token_id: str
token_uri: str
block_number: str
block_number: int
block_timestamp: str
address: str

Wyświetl plik

@ -1,9 +1,11 @@
import argparse
import json
from time import time, sleep
from urllib.error import HTTPError
import urllib.request
import logging
from typing import Dict, Any
import requests
from typing import Any
from concurrent.futures import ThreadPoolExecutor
from moonstreamdb.blockchain import AvailableBlockchainType
@ -15,8 +17,8 @@ from moonstreamdb.db import (
from sqlalchemy.orm import sessionmaker
from .db import (
commit_session,
get_uris_of_tokens,
get_current_metadata_for_address,
get_uri_addresses,
get_not_updated_metadata_for_address,
metadata_to_label,
)
from ..settings import (
@ -41,8 +43,14 @@ def crawl_uri(token_uri_data: TokenURIs) -> Any:
result = None
while retry < 3:
try:
response = urllib.request.urlopen(token_uri_data.token_uri, timeout=5)
req = urllib.request.Request(
token_uri_data.token_uri,
None,
{
"User-agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5"
},
)
response = urllib.request.urlopen(req, timeout=5)
if response.status == 200:
result = json.loads(response.read())
@ -51,12 +59,17 @@ def crawl_uri(token_uri_data: TokenURIs) -> Any:
except HTTPError as error:
logger.error(f"request end with error statuscode: {error.code}")
logger.error(f"requested uri: {token_uri_data.token_uri}")
retry += 1
sleep(2)
continue
except Exception as err:
logger.error(err)
logger.error(f"requested uri: {token_uri_data.token_uri}")
retry += 1
sleep(2)
continue
sleep(0.5)
return result, token_uri_data
@ -81,20 +94,23 @@ def parse_metadata(blockchain_type: AvailableBlockchainType, batch_size: int):
# run crawling of levels
try:
uris_of_tokens = get_uris_of_tokens(db_session, blockchain_type)
meradata_addresses = get_uri_addresses(db_session, blockchain_type)
tokens_uri_by_address: Dict[str, Any] = {}
for address in meradata_addresses:
for token_uri_data in uris_of_tokens:
if token_uri_data.address not in tokens_uri_by_address:
tokens_uri_by_address[token_uri_data.address] = []
tokens_uri_by_address[token_uri_data.address].append(token_uri_data)
not_updated_tokens = get_not_updated_metadata_for_address(
db_session,
blockchain_type,
address=address,
)
for address in tokens_uri_by_address:
logger.info(
f"Start crawling {len(not_updated_tokens)} tokens of address {address}"
)
for requests_chunk in [
tokens_uri_by_address[address][i : i + batch_size]
for i in range(0, len(tokens_uri_by_address[address]), batch_size)
not_updated_tokens[i : i + batch_size]
for i in range(0, len(not_updated_tokens), batch_size)
]:
writed_labels = 0

Wyświetl plik

@ -1,6 +1,7 @@
import logging
import json
from typing import Dict, Any, Optional, List
from unittest import result
from moonstreamdb.blockchain import AvailableBlockchainType, get_label_model
from sqlalchemy.orm import Session
@ -60,9 +61,9 @@ def commit_session(db_session: Session) -> None:
raise e
def get_uris_of_tokens(
def get_uri_addresses(
db_session: Session, blockchain_type: AvailableBlockchainType
) -> List[TokenURIs]:
) -> List[str]:
"""
Get meatadata URIs.
@ -70,47 +71,22 @@ def get_uris_of_tokens(
label_model = get_label_model(blockchain_type)
table = label_model.__tablename__
addresses = (
db_session.query(label_model.address.distinct())
.filter(label_model.label == VIEW_STATE_CRAWLER_LABEL)
.filter(label_model.label_data["name"].astext == "tokenURI")
).all()
metadata_for_parsing = db_session.execute(
""" SELECT
DISTINCT ON(label_data -> 'inputs'-> 0 ) label_data -> 'inputs'-> 0 as token_id,
label_data -> 'result' as token_uri,
block_number as block_number,
block_timestamp as block_timestamp,
address as address
result = [address[0] for address in addresses]
FROM
{}
WHERE
label = :label
AND label_data ->> 'name' = :name
ORDER BY
label_data -> 'inputs'-> 0 ASC,
block_number :: INT DESC;
""".format(
table
),
{"table": table, "label": VIEW_STATE_CRAWLER_LABEL, "name": "tokenURI"},
)
results = [
TokenURIs(
token_id=data[0],
token_uri=data[1],
block_number=data[2],
block_timestamp=data[3],
address=data[4],
)
for data in metadata_for_parsing
]
return results
return result
def get_current_metadata_for_address(
db_session: Session, blockchain_type: AvailableBlockchainType, address: str
):
def get_not_updated_metadata_for_address(
db_session: Session,
blockchain_type: AvailableBlockchainType,
address: str,
) -> List[TokenURIs]:
"""
Get existing metadata.
"""
@ -120,22 +96,79 @@ def get_current_metadata_for_address(
table = label_model.__tablename__
current_metadata = db_session.execute(
""" SELECT
DISTINCT ON(label_data ->> 'token_id') label_data ->> 'token_id' as token_id
FROM
{}
WHERE
address = :address
AND label = :label
ORDER BY
label_data ->> 'token_id' ASC,
block_number :: INT DESC;
""".format(
table
""" with current_tokens_uri as (
SELECT
DISTINCT ON((label_data -> 'inputs' -> 0) :: int) (label_data -> 'inputs' -> 0) :: text as token_id,
label_data ->> 'result' as token_uri,
block_number,
address,
block_timestamp
from
{}
where
label = :view_state_label
AND address = :address
and label_data ->> 'name' = 'tokenURI'
order by
(label_data -> 'inputs' -> 0) :: INT ASC,
block_number :: INT DESC
),
tokens_metadata as (
SELECT
DISTINCT ON((label_data ->> 'token_id') :: int) (label_data ->> 'token_id') :: text as token_id,
label_data ->>'token_uri' as token_uri,
block_number
from
{}
where
label = :metadata_label
AND address = :address
order by
(label_data ->> 'token_id') :: INT ASC,
block_number :: INT DESC
),
tokens_state as (
SELECT
current_tokens_uri.token_id,
current_tokens_uri.token_uri as state_token_uri,
current_tokens_uri.block_number as view_state_block_number,
current_tokens_uri.block_timestamp as block_timestamp,
current_tokens_uri.address as address,
tokens_metadata.block_number as metadata_block_number,
tokens_metadata.token_uri as metadata_token_uri
from
current_tokens_uri
left JOIN tokens_metadata ON current_tokens_uri.token_id = tokens_metadata.token_id
)
SELECT
token_id,
state_token_uri,
view_state_block_number,
block_timestamp,
address
from
tokens_state
where
view_state_block_number > metadata_block_number OR metadata_token_uri is null OR metadata_token_uri != state_token_uri;
""".format(
table, table
),
{"address": address, "label": METADATA_CRAWLER_LABEL},
)
{
"metadata_label": METADATA_CRAWLER_LABEL,
"view_state_label": VIEW_STATE_CRAWLER_LABEL,
"address": address,
},
).all()
result = [data[0] for data in current_metadata]
results = [
TokenURIs(
token_id=data[0],
token_uri=data[1],
block_number=data[2],
block_timestamp=data[3],
address=data[4],
)
for data in current_metadata
]
return result
return results