2022-03-09 12:08:31 +00:00
import csv
2022-11-24 12:40:39 +00:00
import hashlib
2022-02-16 00:57:39 +00:00
import json
import logging
2022-03-09 15:51:49 +00:00
import re
2024-03-28 13:37:45 +00:00
from collections import OrderedDict
2022-02-16 16:12:42 +00:00
from io import StringIO
2022-11-24 12:40:39 +00:00
from typing import Any , Dict
2022-02-16 16:12:42 +00:00
2022-08-25 12:13:46 +00:00
from sqlalchemy . orm import sessionmaker
2023-03-08 19:09:06 +00:00
from sqlalchemy . sql import text
2023-03-09 16:41:50 +00:00
from sqlalchemy . sql . expression import TextClause
2022-11-24 12:40:39 +00:00
from . . actions import push_data_to_bucket
2023-03-09 07:28:34 +00:00
from . . db import RO_pre_ping_query_engine
from . . reporter import reporter
from . . settings import MOONSTREAM_S3_QUERIES_BUCKET_PREFIX
2022-02-16 00:57:39 +00:00
# Module import side effect: request INFO-level logging via basicConfig and
# create the logger used by the functions in this module.
logging . basicConfig ( level = logging . INFO )
logger = logging . getLogger ( __name__ )
2023-11-30 13:11:20 +00:00
# Symbols a user-supplied query must not contain: any of [ ] @ # $ % ^ & ? ; `
# or either SQL block-comment delimiter ("/*" or "*/").
# The pattern must not contain whitespace: stray spaces turn "$" and "^" into
# anchors and make the comment alternatives match plain " / " division.
QUERY_REGEX = re.compile(r"[\[\]@#$%^&?;`]|/\*|\*/")
2022-03-09 15:51:49 +00:00
class QueryNotValid(Exception):
    """
    Error signalling that a query did not pass validation.
    """
2022-02-16 00:57:39 +00:00
2022-03-09 15:51:49 +00:00
def query_validation(query: str) -> str:
    """
    Sanitize provided query.

    Returns the query unchanged when QUERY_REGEX finds no match in it.

    Raises QueryNotValid when QUERY_REGEX matches anywhere in the query.
    """
    if QUERY_REGEX.search(query) is not None:
        raise QueryNotValid("Query contains restricted symbols")

    return query
2022-08-25 12:13:46 +00:00
def to_json_types(value):
    """
    Convert a value into something suitable for the JSON payload.

    Sets become lists; str, int, tuple, dict and list instances are returned
    as is; every other value (including float and None) is passed through
    str().
    """
    if isinstance(value, set):
        return list(value)

    passthrough_types = (str, int, tuple, dict, list)
    return value if isinstance(value, passthrough_types) else str(value)
2022-11-29 13:06:27 +00:00
def from_json_types(value):
    """
    Convert a JSON-decoded value into a form usable as a query parameter.

    Lists become tuples; str, int, tuple and dict instances are returned as
    is; every other value (including float and None) is passed through str().
    """
    if isinstance(value, list):
        # psycopg2 issue with list support
        return tuple(value)

    unchanged_types = (str, int, tuple, dict)
    return value if isinstance(value, unchanged_types) else str(value)
2022-11-28 14:52:30 +00:00
2022-02-16 16:12:42 +00:00
def data_generate(
    query_id: str,
    file_type: str,
    bucket: str,
    key: str,
    query: TextClause,
    params: Dict[str, Any],
    params_hash: str,
):
    """
    Generate query and push it to S3

    Executes `query` with `params` on a session bound to
    RO_pre_ping_query_engine, renders the rows as CSV (when file_type is
    "csv") or as JSON (any other file_type), and hands the encoded bytes to
    push_data_to_bucket under `bucket`/`key` together with a metadata dict.

    Any exception raised while querying, rendering or uploading is logged,
    the session is rolled back, and the error is passed to
    reporter.error_report; it is not re-raised. The session is always closed.
    """
    # Built before the session exists so that a failure in json.dumps(params)
    # cannot leave an unclosed session behind.
    metadata = {
        "source": "drone-query-generation",
        "query_id": query_id,
        "file_type": file_type,
        "params_hash": params_hash,
        "params": json.dumps(params),
    }

    process_session = sessionmaker(bind=RO_pre_ping_query_engine)
    db_session = process_session()

    try:
        # TODO:(Andrey) Need optimization that information is useful but incomplete
        # Fetched once and shared by both output formats.
        block_number, block_timestamp = db_session.execute(
            text(
                "SELECT block_number, block_timestamp FROM polygon_labels WHERE block_number=(SELECT max(block_number) FROM polygon_labels where label='moonworm-alpha') limit 1;"
            ),
        ).one()

        if file_type == "csv":
            csv_buffer = StringIO()
            csv_writer = csv.writer(csv_buffer, delimiter=";")

            query_instance = db_session.execute(query, params)  # type: ignore

            # Header row with column names, then all result rows.
            csv_writer.writerow(query_instance.keys())
            csv_writer.writerows(query_instance.fetchall())

            # CSV has no place for block info in the payload, so it goes
            # into the object metadata instead.
            metadata["block_number"] = block_number
            metadata["block_timestamp"] = block_timestamp

            data = csv_buffer.getvalue().encode("utf-8")
        else:
            # JSON embeds block info in the payload itself.
            data = json.dumps(
                {
                    "block_number": block_number,
                    "block_timestamp": block_timestamp,
                    "data": [
                        {
                            column: to_json_types(value)
                            for column, value in row._asdict().items()
                        }
                        for row in db_session.execute(query, params).all()
                    ],
                }
            ).encode("utf-8")

        push_data_to_bucket(
            data=data,
            key=key,
            bucket=bucket,
            metadata=metadata,
        )
    except Exception as err:
        logger.error(f"Error while generating data: {err}")
        db_session.rollback()
        reporter.error_report(
            err,
            [
                "queries",
                "execution",
                # Two separate tags; a missing comma would fuse them into one.
                f"query_id:{query_id}",
                f"file_type:{file_type}",
            ],
        )
    finally:
        db_session.close()