moonstream/crawlers/mooncrawl/mooncrawl/stats_worker/queries.py

import csv
import hashlib
import json
import logging
import re
from collections import OrderedDict
from io import StringIO
from typing import Any, Dict

from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
from sqlalchemy.sql.expression import TextClause

from ..actions import push_data_to_bucket
from ..db import RO_pre_ping_query_engine
from ..reporter import reporter
from ..settings import MOONSTREAM_S3_QUERIES_BUCKET_PREFIX

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
QUERY_REGEX = re.compile(r"[\[\]@#$%^&?;`]|/\*|\*/")


class QueryNotValid(Exception):
"""
    Raised when query validation fails.
"""
2022-02-16 00:57:39 +00:00
2022-03-09 15:51:49 +00:00
def query_validation(query: str) -> str:
"""
    Validate that the provided query contains no restricted symbols.
"""
    if QUERY_REGEX.search(query) is not None:
raise QueryNotValid("Query contains restricted symbols")
return query
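# Illustrative usage of query_validation (hypothetical example, not taken from
# the original module): clean SQL passes through unchanged, while queries that
# contain restricted symbols such as ";" or SQL comment markers raise
# QueryNotValid:
#
#   query_validation("SELECT label FROM polygon_labels")            # returned as-is
#   query_validation("SELECT 1; DROP TABLE polygon_labels")         # raises QueryNotValid
#   query_validation("SELECT 1 /* comment */ FROM polygon_labels")  # raises QueryNotValid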
def to_json_types(value):
if isinstance(value, (str, int, tuple, dict, list)):
return value
elif isinstance(value, set):
return list(value)
else:
return str(value)
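# Illustrative behavior of to_json_types (hypothetical example): JSON-native
# values pass through, sets become lists, and anything else the database driver
# returns (e.g. decimal.Decimal, datetime) is stringified:
#
#   to_json_types({"a": 1})          # -> {"a": 1}
#   to_json_types({1, 2})            # -> [1, 2] (element order not guaranteed)
#   to_json_types(Decimal("1.5"))    # -> "1.5"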
def from_json_types(value):
if isinstance(value, (str, int, tuple, dict)):
return value
    elif isinstance(value, list):  # psycopg2 binds tuples (not lists) as IN (...) parameters
return tuple(value)
else:
return str(value)
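# Illustrative behavior of from_json_types (hypothetical example): lists are
# turned into tuples before being bound as query parameters, because psycopg2
# adapts tuples for SQL IN (...) clauses while lists are adapted as PostgreSQL
# arrays:
#
#   from_json_types([1, 2, 3])   # -> (1, 2, 3)
#   from_json_types("0xdeadbeef")  # -> "0xdeadbeef"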
def data_generate(
query_id: str,
file_type: str,
bucket: str,
key: str,
query: TextClause,
params: Dict[str, Any],
params_hash: str,
):
"""
    Execute the query and push the results to S3.
"""
process_session = sessionmaker(bind=RO_pre_ping_query_engine)
db_session = process_session()
metadata = {
"source": "drone-query-generation",
"query_id": query_id,
"file_type": file_type,
"params_hash": params_hash,
"params": json.dumps(params),
}
try:
        # TODO(Andrey): Needs optimization; this information is useful but incomplete.
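        # Look up the newest block indexed by the moonworm-alpha crawler so the
        # exported data can be stamped with how fresh the underlying labels are.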
block_number, block_timestamp = db_session.execute(
text(
"SELECT block_number, block_timestamp FROM polygon_labels WHERE block_number=(SELECT max(block_number) FROM polygon_labels where label='moonworm-alpha') limit 1;"
),
).one()
if file_type == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, delimiter=";")
# engine.execution_options(stream_results=True)
query_instance = db_session.execute(query, params) # type: ignore
csv_writer.writerow(query_instance.keys())
csv_writer.writerows(query_instance.fetchall())
metadata["block_number"] = block_number
metadata["block_timestamp"] = block_timestamp
data = csv_buffer.getvalue().encode("utf-8")
else:
            # block_number and block_timestamp were already fetched above; reuse them.
data = json.dumps(
{
"block_number": block_number,
"block_timestamp": block_timestamp,
"data": [
{
key: to_json_types(value)
for key, value in row._asdict().items()
}
for row in db_session.execute(query, params).all()
],
}
).encode("utf-8")
push_data_to_bucket(
data=data,
key=key,
bucket=bucket,
metadata=metadata,
)
except Exception as err:
logger.error(f"Error while generating data: {err}")
db_session.rollback()
reporter.error_report(
err,
[
"queries",
"execution",
f"query_id:{query_id}" f"file_type:{file_type}",
],
)
finally:
db_session.close()
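# Illustrative invocation of data_generate (a sketch only; the real caller in
# the Moonstream API constructs the S3 key and params hash itself, so the key
# layout and md5 hashing below are assumptions made for demonstration):
#
#   params = {"label": "moonworm-alpha"}
#   params_hash = hashlib.md5(json.dumps(params).encode("utf-8")).hexdigest()
#   data_generate(
#       query_id="example-query-id",
#       file_type="json",
#       bucket="example-queries-bucket",
#       key=f"{MOONSTREAM_S3_QUERIES_BUCKET_PREFIX}/example-query-id/{params_hash}/data.json",
#       query=text(query_validation("SELECT label, block_number FROM polygon_labels LIMIT 10")),
#       params=params,
#       params_hash=params_hash,
#   )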