# Mirror of https://github.com/bugout-dev/moonstream (145 lines, 4.2 KiB, Python)
import csv
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import re
|
|
from collections import OrderedDict
|
|
from io import StringIO
|
|
from typing import Any, Dict
|
|
|
|
from sqlalchemy.orm import sessionmaker
|
|
from sqlalchemy.sql import text
|
|
from sqlalchemy.sql.expression import TextClause
|
|
|
|
from ..actions import push_data_to_bucket
|
|
from ..db import RO_pre_ping_query_engine
|
|
from ..reporter import reporter
|
|
from ..settings import MOONSTREAM_S3_QUERIES_BUCKET_PREFIX
|
|
|
|
# Root logging is configured at INFO level as a side effect of importing this
# module (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Symbols rejected by query_validation: any one of the characters
# [ ] @ # $ % ^ & ? ; ` or the block-comment delimiters /* and */.
QUERY_REGEX = re.compile(r"[\[\]@#$%^&?;`]|/\*|\*/")
|
|
|
|
|
|
class QueryNotValid(Exception):
    """Error signalling that a query did not pass validation."""
|
|
|
|
|
|
def query_validation(query: str) -> str:
    """
    Validate the provided query and return it unchanged.

    The query is rejected when QUERY_REGEX matches anywhere inside it; no
    rewriting or escaping of the query text is performed.

    Args:
        query: Raw query text to check.

    Returns:
        The same query string, unmodified, when no restricted symbol is found.

    Raises:
        QueryNotValid: If the query contains a restricted symbol.
    """
    # A missing match is the None singleton, so test by identity.
    if QUERY_REGEX.search(query) is not None:
        raise QueryNotValid("Query contains restricted symbols")

    return query
|
|
|
|
|
|
def to_json_types(value):
    """
    Map a value onto one of the types this module emits in JSON payloads.

    A set becomes a list; str, int, tuple, dict and list values are returned
    as they are (their contents are not inspected); every other value is
    replaced by its str() representation.
    """
    if isinstance(value, set):
        return list(value)

    passthrough_types = (str, int, tuple, dict, list)
    if isinstance(value, passthrough_types):
        return value

    # Fallback for everything else (floats and None included).
    return str(value)
|
|
|
|
|
|
def from_json_types(value):
    """
    Convert a JSON-decoded value into a form usable as a query parameter.

    A list becomes a tuple; str, int, tuple and dict values are returned as
    they are; every other value is replaced by its str() representation.
    """
    if isinstance(value, list):
        # psycopg2 issue with list support
        return tuple(value)

    passthrough_types = (str, int, tuple, dict)
    if isinstance(value, passthrough_types):
        return value

    # Fallback for everything else (floats and None included).
    return str(value)
|
|
|
|
|
|
def data_generate(
    query_id: str,
    file_type: str,
    bucket: str,
    key: str,
    query: TextClause,
    params: Dict[str, Any],
    params_hash: str,
):
    """
    Execute a query and push its result to S3.

    The result is serialized as CSV (";"-delimited, header row first) when
    file_type is "csv", and as a JSON document otherwise. In both cases the
    latest block number and timestamp labelled 'moonworm-alpha' in
    polygon_labels are attached: as object metadata for CSV, inside the
    payload for JSON.

    Any exception is logged, the session is rolled back and the error is
    passed to the reporter; it is not re-raised to the caller.

    Args:
        query_id: Identifier of the query, used in metadata and error tags.
        file_type: Output format; "csv" selects CSV, anything else JSON.
        bucket: Destination bucket passed to push_data_to_bucket.
        key: Destination object key passed to push_data_to_bucket.
        query: Query to execute.
        params: Bind parameters for the query; also stored JSON-encoded in
            the metadata.
        params_hash: Hash of the parameters, stored in the metadata.
    """
    process_session = sessionmaker(bind=RO_pre_ping_query_engine)
    db_session = process_session()

    metadata = {
        "source": "drone-query-generation",
        "query_id": query_id,
        "file_type": file_type,
        "params_hash": params_hash,
        "params": json.dumps(params),
    }

    try:
        # TODO(Andrey): Needs optimization; this information is useful but incomplete.
        # Fetched once and shared by both output formats.
        block_number, block_timestamp = db_session.execute(
            text(
                "SELECT block_number, block_timestamp FROM polygon_labels WHERE block_number=(SELECT max(block_number) FROM polygon_labels where label='moonworm-alpha') limit 1;"
            ),
        ).one()

        if file_type == "csv":
            csv_buffer = StringIO()
            csv_writer = csv.writer(csv_buffer, delimiter=";")

            # NOTE: the full result set is loaded into memory; streaming
            # (execution_options(stream_results=True)) is not enabled.
            query_instance = db_session.execute(query, params)  # type: ignore

            csv_writer.writerow(query_instance.keys())
            csv_writer.writerows(query_instance.fetchall())

            # CSV has no place for block info in the payload, so it travels
            # as object metadata instead.
            # NOTE(review): these values are not converted to str here;
            # confirm push_data_to_bucket accepts non-string metadata values.
            metadata["block_number"] = block_number
            metadata["block_timestamp"] = block_timestamp

            data = csv_buffer.getvalue().encode("utf-8")
        else:
            data = json.dumps(
                {
                    "block_number": block_number,
                    "block_timestamp": block_timestamp,
                    "data": [
                        {
                            column: to_json_types(value)
                            for column, value in row._asdict().items()
                        }
                        for row in db_session.execute(query, params).all()
                    ],
                }
            ).encode("utf-8")

        push_data_to_bucket(
            data=data,
            key=key,
            bucket=bucket,
            metadata=metadata,
        )
    except Exception as err:
        logger.error(f"Error while generating data: {err}")
        db_session.rollback()
        reporter.error_report(
            err,
            [
                "queries",
                "execution",
                # Two separate tags: without the comma between them the
                # adjacent f-strings are concatenated into a single tag.
                f"query_id:{query_id}",
                f"file_type:{file_type}",
            ],
        )
    finally:
        db_session.close()
|