From cbb59adba8433da757858773de51e34575d364bc Mon Sep 17 00:00:00 2001 From: Rob Hedgpeth Date: Wed, 18 Dec 2019 15:19:07 -0800 Subject: [PATCH] Added data scripts, updated readme --- Flights/README.md | 84 +++++-------------------------- Flights/data/create_flights_db.sh | 7 +++ Flights/data/get_flight_data.sh | 19 +++++++ Flights/data/load_flights_data.sh | 25 +++++++++ 4 files changed, 63 insertions(+), 72 deletions(-) create mode 100644 Flights/data/create_flights_db.sh create mode 100644 Flights/data/get_flight_data.sh create mode 100644 Flights/data/load_flights_data.sh diff --git a/Flights/README.md b/Flights/README.md index 866dcb6..4650c92 100644 --- a/Flights/README.md +++ b/Flights/README.md @@ -11,10 +11,10 @@ This `README` will walk you through the steps for getting this app up and runnin # Table of Contents 1. [Getting started with MariaDB](#overview) 1. [The Basics](#intro-mariadb) - 2. [Downloadng and installing MariaDB ColumnStore](#installation) - 3. [Using the MariaDB columnar database](#mariadb-columnar) + 2. [Downloading and installing MariaDB ColumnStore](#installation) 2. [Requirements](#requirements) 3. [Getting started with the app](#getting-started) + 1. [Get the data, create the schema, and load the data](#data) 1. [Grab the code](#grab-code) 2. [Build the code](#build-code) 3. [Run the app](#run-app) @@ -30,75 +30,10 @@ This `README` will walk you through the steps for getting this app up and runnin

-### Downloadng and installing MariaDB ColumnStore +### Downloading and installing MariaDB ColumnStore [MariaDB ColumnStore](https://mariadb.com/docs/features/mariadb-columnstore/) extends [MariaDB Server](https://mariadb.com/products/) with distributed storage and massively parallel processing to support scalable, high-performance analytics. It can be deployed as the analytics component of MariaDB Platform using MariaDB MaxScale for change-data-capture and hybrid transactional/analytical query routing, or as a standalone columnar database for interactive, ad hoc analytics at scale. You can find more information on how to download and install ColumnStore [here](https://mariadb.com/downloads/#mariadb_platform-mariadb_columnstore). -### Using the MariaDB columnar database - -[MariaDB ColumnStore](https://mariadb.com/docs/features/mariadb-columnstore/) provides distributed, columnar storage for scalable analytical processing. MariaDB ColumnStore is a component of MariaDB Platform. The primary documentation is located in the [MariaDB Public Knowledge Base](https://mariadb.com/kb/en/library/mariadb-columnstore/). - -This application uses three a tables (`airlines`, `airports`, `flights`) within a single MariaDB ColumnStore database. 
- -```sql -CREATE TABLE `airlines` ( - `iata_code` char(2) DEFAULT NULL, - `airline` varchar(30) DEFAULT NULL -) ENGINE=Columnstore DEFAULT CHARSET=utf8; -``` - -```sql -CREATE TABLE `airports` ( - `iata_code` char(3) DEFAULT NULL, - `airport` varchar(80) DEFAULT NULL, - `city` varchar(30) DEFAULT NULL, - `state` char(2) DEFAULT NULL, - `country` varchar(30) DEFAULT NULL, - `latitude` float DEFAULT NULL, - `longitude` float DEFAULT NULL -) ENGINE=Columnstore DEFAULT CHARSET=utf8; -``` - -```sql -CREATE TABLE `flights` ( - `year` smallint(6) DEFAULT NULL, - `month` tinyint(4) DEFAULT NULL, - `day` tinyint(4) DEFAULT NULL, - `day_of_week` tinyint(4) DEFAULT NULL, - `fl_date` date DEFAULT NULL, - `carrier` char(2) DEFAULT NULL, - `tail_num` char(6) DEFAULT NULL, - `fl_num` smallint(6) DEFAULT NULL, - `origin` varchar(5) DEFAULT NULL, - `dest` varchar(5) DEFAULT NULL, - `crs_dep_time` char(4) DEFAULT NULL, - `dep_time` char(4) DEFAULT NULL, - `dep_delay` smallint(6) DEFAULT NULL, - `taxi_out` smallint(6) DEFAULT NULL, - `wheels_off` char(4) DEFAULT NULL, - `wheels_on` char(4) DEFAULT NULL, - `taxi_in` smallint(6) DEFAULT NULL, - `crs_arr_time` char(4) DEFAULT NULL, - `arr_time` char(4) DEFAULT NULL, - `arr_delay` smallint(6) DEFAULT NULL, - `cancelled` smallint(6) DEFAULT NULL, - `cancellation_code` smallint(6) DEFAULT NULL, - `diverted` smallint(6) DEFAULT NULL, - `crs_elapsed_time` smallint(6) DEFAULT NULL, - `actual_elapsed_time` smallint(6) DEFAULT NULL, - `air_time` smallint(6) DEFAULT NULL, - `distance` smallint(6) DEFAULT NULL, - `carrier_delay` smallint(6) DEFAULT NULL, - `weather_delay` smallint(6) DEFAULT NULL, - `nas_delay` smallint(6) DEFAULT NULL, - `security_delay` smallint(6) DEFAULT NULL, - `late_aircraft_delay` smallint(6) DEFAULT NULL -) ENGINE=Columnstore DEFAULT CHARSET=utf8; -``` - -For more information about MariaDB ColumnStore databases please check out the [MariaDB blog](https://mariadb.com/search-results/?q=columnstore)! 
- - ## Requirements This project assumes you have familiarity with building web applications using ReactJS and NodeJS technologies. @@ -111,12 +46,15 @@ This project assumes you have familiarity with building web applications using R ## Getting started -In order to build and run the application you will need to have NodeJS installed. You can find more information [here](https://nodejs.org/). +### Get the data, create the schema, and load the data -### [Create the schema and load the dataset](https://github.com/mariadb-corporation/mariadb-columnstore-samples/tree/master/flights) +Instructions on retrieving and importing the flights dataset into a MariaDB ColumnStore database can be found [here](https://github.com/mariadb-corporation/mariadb-columnstore-samples/tree/master/flights). Please note that the scripts provided within that repository only target data for the year 2019 (~7.5 million records). -This application uses data from the United States Department of Transportation that is imported into a MariaDB ColumnStore database. For instructions on how to retrieve the dataset and import it into a MariaDB ColumnStore database please see the instructions [here](https://github.com/mariadb-corporation/mariadb-columnstore-samples/tree/master/flights) provided by [Todd Stoffel](https://github.com/toddstoffel). +If you'd like to retrieve data spanning from 1990 to 2019 (~180 million records) please use the following scripts: +* [get_flight_data.sh](data/get_flight_data.sh) +* [create_flights_db.sh](data/create_flights_db.sh) +* [load_flights_data.sh](data/load_flights_data.sh) ### Grab the code @@ -124,7 +62,7 @@ Download this code directly or use [git](git-scm.org) (through CLI or a client) ### Configure the code -Update the MariaDB connection configuration [here](src/db.js). +Update the MariaDB connection configuration [here](src/db.js) to point to **your** ColumnStore instance of MariaDB. 
```js const pool = mariadb.createPool({ @@ -139,6 +77,8 @@ const pool = mariadb.createPool({ ### Build the code +**Important:** In order to build and run the application you will need to have NodeJS installed. You can find more information [here](https://nodejs.org/). + Once you have retrieved a copy of the code you're ready to build and run the project! However, before running the code it's important to point out that the application uses several Node Packages. For the client-side:
diff --git a/Flights/data/create_flights_db.sh b/Flights/data/create_flights_db.sh
new file mode 100644
index 0000000..27e8051
--- /dev/null
+++ b/Flights/data/create_flights_db.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# Creates the flights database and loads its two dimension tables.
+#
+# Usage: ./create_flights_db.sh
+#   Run from the directory that contains ./schema; schema.sql, airports.csv
+#   and airlines.csv are read from there.
+set -euo pipefail
+
+SCHEMA_DIR=$(readlink -f ./schema)
+if [[ ! -d "$SCHEMA_DIR" ]]; then
+  echo "error: schema directory not found: $SCHEMA_DIR" >&2
+  exit 1
+fi
+
+# create flights database (dropping if exists) with 3 columnstore tables: flights, airports, airlines
+/usr/bin/mysql --defaults-file=/etc/my.cnf -u root -vvv < "$SCHEMA_DIR/schema.sql"
+
+# load data into dimension tables airports and airlines.
+/usr/bin/cpimport -m 2 -s ',' -E '"' flights airports -l "$SCHEMA_DIR/airports.csv"
+/usr/bin/cpimport -m 2 -s ',' -E '"' flights airlines -l "$SCHEMA_DIR/airlines.csv"
diff --git a/Flights/data/get_flight_data.sh b/Flights/data/get_flight_data.sh
new file mode 100644
index 0000000..9ae2cdf
--- /dev/null
+++ b/Flights/data/get_flight_data.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# This script will remotely invoke the bureau of transportation statistics web form to retrieve data by month:
+# https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time
+# for the specific columns listed in the SQL and utilized by the sample schema.
+#
+# Each month from 1990-01 through 2019-12 is written, minus its first line,
+# to ./data/YYYY-MM.csv. A month that cannot be retrieved is reported on
+# stderr and skipped, and the script then exits with status 1.
+set -uo pipefail
+
+readonly DOWNLOAD_URL='https://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=236'
+
+mkdir -p data || exit 1
+
+# Download and unpack in a private scratch directory so that CSV files already
+# present in the current directory are never deleted or picked up by mistake.
+workdir=$(mktemp -d) || exit 1
+trap 'rm -rf -- "$workdir"' EXIT
+
+# Fetches one month of data and writes it, minus its first line, to a file.
+# Arguments: $1 - year, $2 - month (1-12), $3 - output CSV path
+# Returns: non-zero if the download, the unzip or the CSV lookup fails.
+fetch_month() {
+  local y=$1 m=$2 out=$3
+  local -a csv_files
+  rm -rf -- "$workdir/unzipped"
+  # --fail: an HTTP error is a failure, not an error page saved as data.zip.
+  curl --fail -L -o "$workdir/data.zip" -d "sqlstr=+SELECT+YEAR%2CMONTH%2CDAY_OF_MONTH%2CDAY_OF_WEEK%2CFL_DATE%2CCARRIER%2CTAIL_NUM%2CFL_NUM%2CORIGIN%2CDEST%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CDIVERTED%2CCRS_ELAPSED_TIME%2CACTUAL_ELAPSED_TIME%2CAIR_TIME%2CDISTANCE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY+FROM++T_ONTIME+WHERE+Month+%3D$m+AND+YEAR%3D$y" "$DOWNLOAD_URL" || return
+  unzip "$workdir/data.zip" -d "$workdir/unzipped" || return
+  csv_files=("$workdir"/unzipped/*.csv)
+  [[ -f "${csv_files[0]}" ]] || return
+  # Drop the first line of the CSV (presumably the column header row).
+  tail -n +2 "${csv_files[0]}" > "$out"
+}
+
+failed=0
+for y in {1990..2019}; do
+  for m in {1..12}; do
+    yyyymm="$y-$(printf '%02d' "$m")"
+    echo "$yyyymm"
+    if ! fetch_month "$y" "$m" "data/$yyyymm.csv"; then
+      echo "warning: could not retrieve $yyyymm, skipping" >&2
+      failed=1
+    fi
+  done
+done
+exit "$failed"
diff --git a/Flights/data/load_flights_data.sh b/Flights/data/load_flights_data.sh
new file mode 100644
index 0000000..2ccd8c6
--- /dev/null
+++ b/Flights/data/load_flights_data.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Loads the flight CSV files found under ./data into the flights table of the
+# flights database.
+#
+# Usage: ./load_flights_data.sh [match]
+#   match - optional; only files whose name contains this text are loaded.
+#           Without it every .csv file under ./data is loaded.
+set -uo pipefail
+
+# check for argument, if so use as wildcard for file load match, otherwise load everything
+DATA_DIR=$(readlink -f ./data)
+filematch="*"
+if [[ $# -eq 1 ]]; then
+  filematch="*$1*"
+fi
+
+# load the specified files under the data directory with the file pattern match
+# here we use cpimport mode 2 to force processing at each PM node which has
+# the advantage of this being runnable as a regular user with a root installation.
+status=0
+# $filematch is deliberately unquoted so that its wildcards are expanded.
+for f in "$DATA_DIR"/$filematch.csv; do
+  # A pattern that matches nothing is left as literal text; skip it.
+  [[ -f "$f" ]] || continue
+  echo "$f"
+  /usr/bin/cpimport -m2 -s ',' -E '"' flights flights -l "$f" || status=1
+done
+exit "$status"