-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflight_price_model.py
479 lines (355 loc) · 16 KB
/
flight_price_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
# -*- coding: utf-8 -*-
"""Flight Price Model
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1A1R3wu5mfMvGyN7snWIeS6ynOJlHfHZ-
# Flight Price
---
This model will predict flight prices based on [this](https://www.kaggle.com/zernach/2018-airplane-flights) data.
This model is being made for [mia](http://miamarketplace.com/), a place for people to run machine learning models interactively on the web.
"""
#@title Require libraries import cell
"""
required libraries imported
"""
! pip install category_encoders
# __future__ imports must precede all other imports (only the docstring,
# comments, and blank lines may come first in a plain .py file).
from __future__ import print_function

# standard library
import os, io, sys, random, time, pprint

# third-party
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
from numpy import save, load
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import one_hot
from tensorflow.keras import Model, Sequential
from tensorflow.keras import backend
from tensorflow.keras.callbacks import LambdaCallback, Callback, ModelCheckpoint
from tensorflow.keras.layers import Activation, Reshape, Dense, Embedding, Dropout, Input, LayerNormalization, BatchNormalization, concatenate, Flatten, Concatenate
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adadelta, Adagrad, Adamax
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
# SOME CONSTANTS
# Every artifact (checkpoint, cached predictions, cached inputs) lives in one
# Google Drive folder so the notebook can resume after a runtime restart.
_DRIVE_DIR = '/content/drive/My Drive/MIA'
checkpoint_dir = f'{_DRIVE_DIR}/best_model.hdf5'            # best-weights checkpoint
model_preds_train = f'{_DRIVE_DIR}/train_preds.npy'         # cached predictions (train)
model_preds_test = f'{_DRIVE_DIR}/test_preds.npy'           # cached predictions (test)
truth_train = f'{_DRIVE_DIR}/truth_train.npy'               # ground-truth prices (train)
truth_test = f'{_DRIVE_DIR}/truth_test.npy'                 # ground-truth prices (test)
train_continuous_dir = f'{_DRIVE_DIR}/train_continuous_dir.npy'
test_continuous_dir = f'{_DRIVE_DIR}/test_continuous_dir.npy'
train_categorical_dir = f'{_DRIVE_DIR}/train_categorical_dir.npy'
test_categorical_dir = f'{_DRIVE_DIR}/test_categorical_dir.npy'
"""
testing if connected to TPU and/or GPU
"""
if 'COLAB_TPU_ADDR' not in os.environ:
print('Not connected to a TPU runtime.')
else:
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print ('Connected to TPU.\n\nTPU address is', tpu_address)
with tf.compat.v1.Session((tpu_address)) as session:
devices = session.list_devices()
print('TPU devices:')
pprint.pprint(devices)
if tf.test.gpu_device_name() == '':
print('\n\nNot connected to a GPU runtime.')
else:
print('\n\nConnected to GPU: ' + tf.test.gpu_device_name())
"""
need to mount the drive to access the data
"""
from google.colab import drive
drive.mount('/content/drive')
"""# Assumptions about the data
## 1. ItinID and MktID are ambiguous columns that do not seem to be significant towards predicting the flight price. The order in which the tickets are bought should not, in an obvious manner, affect how much an airline will charge for a flight ticket. Ergo, these columns will be deleted.
## 2. The OriginWac and DestWac columns are useless too. They are just arbitrary US State/Territory World Area Codes that will not affect the flight price. They shall be removed.
## 3. Several columns are categorical. In order to deal with categorical and continuous data at the same time, we will use [this](https://datascience.stackexchange.com/questions/29634/how-to-combine-categorical-and-continuous-input-features-for-neural-network-trai) technique. The categorical inputs will be run through an Embedding layer and a small neural network, and then their outputs will be concatenated as inputs with the continuous data.
## 4. The categorical data columns are: Origin, Dest, and AirlineCompany.
## 5. ContiguousUSA is already a numerical variable with the values being [2, 1]. It does not need to be converted to numerical values, instead just needs to be normalized.
"""
"""
reading data
"""
path = "/content/drive/My Drive/MIA/Cleaned_2018_Flights.csv"
df = pd.read_csv(path)
df
"""
scaling the target Y price values
"""
bool_scale = False
if bool_scale:
df['PricePerTicket'] = df['PricePerTicket'] / 1000.0 # scaling the target
df
"""
removing the first three unnecessary columns
removing OriginWac and DestWac because they are useless
"""
df.drop(['Unnamed: 0', 'ItinID', 'MktID', 'OriginWac', 'DestWac'], axis=1, inplace=True) # dropping index_col, ItinID, and MktID, OriginWac, DestWac
df
"""## Applying the different types of encodings for all the variables."""
"""
different types of encodings for all the variables
Miles, NumTicketsOrdered: continuous, no encoding
MktCoupons, ContiguousUSA: one hot encoding
Quarter: ordinal encoding (doesn't need to be applied, the variable is already numerically ordinal)
Origin, Dest, AirlineCompany: mean/target encoding
******************THIS IS NOT DONE RIGHT NOW******************
"""
# making the target variable the last one
temp_col = df['PricePerTicket']
df.drop('PricePerTicket', axis=1, inplace=True)
df['PricePerTicket'] = temp_col
# placing the two continuous variables in the beginning
tickets_col = df.pop('NumTicketsOrdered')
df.insert(0, 'NumTicketsOrdered', tickets_col)
miles_col = df.pop('Miles')
df.insert(0, 'Miles', miles_col)
# normalizing Quarter, Miles, and NumTicketsOrdered variables================================================
bool_norm = True
norm_vars = ['Miles', 'NumTicketsOrdered']
if bool_norm:
for col in norm_vars:
df[col] = df[col] / df[col].max()
# ===========================================================================================================
df
"""## First we separate the X and Y into training and testing datasets."""
"""
shuffling the data before separating into X and Y
configuring training and testing datasets
"""
data = df.to_numpy()
print(f"The shape of the data is: {data.shape}")
train, test = train_test_split(data, test_size=0.1, shuffle=True)
X_train, Y_train = train[::1, :-1], train[::1, -1]
X_test, Y_test = test[::1, :-1], test[::1, -1]
print(f"\nTraining shape: {X_train.shape}")
print(f"\nTesting shape: {X_test.shape}")
"""
preparing the inputs to the model by parsing the X_train, X_test variables
this implementation is coming from an article
EMBEDDING LAYERS FOR ALL 6 CATEGORICAL VARIABLES
"""
def parse_vars(var_data):
temp_cont = var_data[:, 0:2]
def categorical_func(cate_data):
temp = list()
for i in range(cate_data.shape[1]):
le = LabelEncoder()
le.fit(cate_data[:, i])
label_encoded_feature = le.transform(cate_data[:, i])
temp.append(label_encoded_feature)
return temp
temp_cate = categorical_func(var_data[:, 2:])
return (temp_cont, temp_cate)
# Encode train and test matrices into (continuous, categorical) model inputs.
# NOTE(review): parse_vars fits a fresh LabelEncoder per call, so the integer
# code assigned to a given category can differ between the train and test
# encodings — confirm this is acceptable or encode the splits jointly.
train_continuous, train_categorical = parse_vars(X_train)
test_continuous, test_categorical = parse_vars(X_test)
print(f"train_continuous.shape: {train_continuous.shape}")
print(f"\ntest_continuous.shape: {test_continuous.shape}")
"""## Now we will start making the dense neural network for the model."""
#@title Initiating loss for training and validation, creating callback functions for stopping training and saving the model periodically.
"""
keras callback
"""
# loss lists so that training need not be completed to plot graphs
training_mse = []
validation_mse = []
class callback(Callback):
def on_epoch_end(self, epoch, logs={}):
if logs.get('mse') <= 0.01:
print(f'\nTraining halted here. Model has achieved {round(logs.get("mse"), 2)} loss on training set.')
self.model.stop_training = True
training_mse.append(logs.get('mse'))
validation_mse.append(logs.get('val_mse'))
call = ModelCheckpoint(checkpoint_dir, monitor='mse', verbose=1,
save_best_only=True, mode='auto', save_freq='epoch')
training_stop = callback()
"""
creating the keras model
"""
cont_input = Input(shape=(train_continuous.shape[-1],), name='continuous_input')
cont = BatchNormalization(name='continuous_batchnorm_1')(cont_input)
# ========================================================================================================
continuous_dense = False
if continuous_dense:
for i, unit in enumerate([16], start=2):
cont = Dense(units=unit, activation='relu', name=f'dense_cont_{i}')(cont)
cont = BatchNormalization(name=f'continuous_batchnorm_{i}')(cont)
# ========================================================================================================
in_layers = list()
em_layers = list()
for i in range(len(train_categorical)):
n_labels = len(np.unique(train_categorical[i])) # unique values in each categorical column
in_layer = Input(shape=(1,), name=f'categorical_layer_{i+1}')
emb_shape = min(50, (n_labels + 1) // 2)
em_layer = Embedding(n_labels+1, emb_shape, name=f'embedding_layer_{i+1}')(in_layer)
em_layer = Reshape(target_shape=(emb_shape,), name=f'categorical_reshape_layer_{i+1}')(em_layer)
em_layer = Dropout(rate=0.4, name=f'categorical_dropout_layer_{i+1}')(em_layer)
in_layers.append(in_layer) # categorical variables
em_layers.append(em_layer)
cate = concatenate(inputs=em_layers, name='merge_categorical')
# ========================================================================================================
categorical_dense = False
if categorical_dense:
for i, unit in enumerate([16, 16], start=1):
cate = Dense(units=unit, activation='relu', name=f'dense_cate_{i}')(cate)
# ========================================================================================================
merged = concatenate(inputs=[cont, cate], name='merge_all')
for i, unit in enumerate([100, 100], start=1):
merged = Dense(units=unit, activation='relu', name=f'merged_dense_{i}')(merged)
# merged = BatchNormalization(name=f'merged_batchnorm_{i}')(merged)
merged = Dropout(rate=0.4, name=f'merged_dropout_{i}')(merged)
out = Dense(units=1, activation='linear', name='output_layer')(merged)
# creating the Model
model = Model(inputs=[cont_input, in_layers], outputs=out, name='FP_Model')
# saving the model graph and seeing the architecture
plot_model(model, show_shapes=True, to_file='/content/drive/My Drive/MIA/nn_graph.png')
model.summary()
"""
compiling and fitting
"""
optimizer = Adam(learning_rate=0.1)
model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
# model will begin training with previously trained weights
try:
model.load_weights(checkpoint_dir)
except:
print('Model architecture has been changed. No weights loaded\n\n')
# consistency with data types; train_categorical is a list of np.ndarray
train_continuous = np.asarray(train_continuous, dtype=np.float32)
test_continuous = np.asarray(test_continuous, dtype=np.float32)
Y_train = np.asarray(Y_train, dtype=np.float32)
history = model.fit(
[train_continuous, train_categorical],
Y_train,
batch_size=512,
epochs=500,
validation_split=0.2,
callbacks=[call, training_stop]
)
"""
plotting losses
"""
width_in_inches = 30
height_in_inches = 10
dots_per_inch = 50
plt.figure(
figsize=(width_in_inches, height_in_inches),
dpi=dots_per_inch)
plt.rcParams['axes.facecolor'] = '#3D383D'
plt.rcParams['legend.facecolor'] = 'white'
plt.plot(training_mse, 'o--b', label='training loss', mew=7, linewidth=3)
plt.plot(validation_mse, 'o--r', label='validation loss', mew=7, linewidth=3)
plt.legend(loc="upper right", fontsize=25)
plt.xlabel('Epoch', fontsize=38, color='white')
plt.ylabel('MSE', fontsize=38, color='white')
plt.xticks(range(1, len(training_mse)), fontsize=17, color='white')
plt.yticks(fontsize=17, color='white')
plt.grid(color='grey', linestyle=':', linewidth=1.7)
plt.show()
plt.close()
"""
loading the best trained model (can jump directly here if training is not required)
"""
model = load_model(checkpoint_dir)
"""## Now that we have a trained model with its losses plotted, we can test it on example flights.
---
## In order to avoid having colab crash the runtime because the numpy arrays are in magnitudes of millions (the subtract function is the culprit), we will save the arrays using numpy format on drive, restart the runtime, and then retrieve them for the computation. This is because upon testing it is observed that the runtime is able to perform the computation on large arrays if that is the only computation run on it.
"""
"""
saving the required data onto drive for later use
"""
# these go here for model predicting purposes =======
ex_train = [train_continuous, train_categorical]
ex_test = [test_continuous, test_categorical]
# ===================================================
# consistency with data types
train_continuous = np.asarray(train_continuous, dtype=np.float32)
test_continuous = np.asarray(test_continuous, dtype=np.float32)
# ======================================================================
# saving model predictions on training and test sets
preds_train = model.predict(ex_train, verbose=0)
preds_test = model.predict(ex_test, verbose=0)
save(model_preds_train, preds_train)
save(model_preds_test, preds_test)
# saving the true Y values
save(truth_train, Y_train)
save(truth_test, Y_test)
# saving the continuous training and test variables
save(train_continuous_dir, train_continuous)
save(test_continuous_dir, test_continuous)
# saving the categorical training and test variables
save(train_categorical_dir, train_categorical)
save(test_categorical_dir, test_categorical)
"""
loading the saved data from drive
"""
# loading model predictions on training and test sets
preds_train = load(model_preds_train)
preds_test = load(model_preds_test)
# loading the true Y values
Y_train = load(truth_train, allow_pickle=True)
Y_test = load(truth_test, allow_pickle=True)
# loading the continuous training and test variables
train_continuous = load(train_continuous_dir)
test_continuous = load(test_continuous_dir)
# loading the categorical training and test variables
train_categorical = load(train_categorical_dir)
test_categorical = load(test_categorical_dir)
#@title enlarge(feature) util function implemented
"""
important util function to modify the input model shapes
"""
def enlarge(feature):
    """Prepend a batch axis to `feature` for single-example model input.

    A 0-d array comes out with shape (1, 1); an n-d array comes out with
    shape (1, *feature.shape).
    """
    if len(feature.shape) == 0:
        # bare 0-d scalar: give it a feature axis before the batch axis
        feature = np.expand_dims(feature, 0)
    return np.expand_dims(feature, 0)
"""
testing the model on an example flight price
"""
rand_train = random.randint(0, len(Y_train))
rand_test = random.randint(0, len(Y_test))
truths_train = round(float(Y_train[rand_train]), 2)
truths_test = round(float(Y_test[rand_test]), 2)
pred_train = round(float(preds_train[rand_train]), 2)
pred_test = round(float(preds_test[rand_test]), 2)
print(f"A training example\nTrue price: ${truths_train}, model predicted: ${pred_train}")
print(f"\n\nA testing example\nTrue price: ${truths_test}, model predicted: ${pred_test}")
"""## Checking the model's accuracy on the training and test sets."""
"""
predicting an approximate accuracy of the model
if the model predicts the correct price of a flight with a +- epsilon value
then that will be a correct prediction, else incorrect
"""
epsilon = 70.0
def accuracy(spec='train'):
preds = None
if spec == 'train':
preds = np.squeeze(preds_train)
diff = np.absolute(np.subtract(preds, Y_train))
diff = diff <= epsilon
diff = diff.astype(int)
diff = diff.sum()
return round((float(diff) / Y_train.shape[0]) * 100, 2)
else:
preds = np.squeeze(preds_test)
diff = np.absolute(np.subtract(preds, Y_test))
diff = diff <= epsilon
diff = diff.astype(int)
diff = diff.sum()
return round((float(diff) / Y_test.shape[0]) * 100, 2)
print(f"Epsilon taken as ${int(epsilon)}")
print(f"Accuracy on the training set: {accuracy('train')}%")
print(f"Accuracy on the test set: {accuracy('test')}%")
"""## Once the model is satisfiably good, it will be exported as a tf model to the drive to be uploaded to [mia](http://miamarketplace.com/)."""
"""
saving the model and the files for uploading to mia
checking the tensorflow and python versions
"""
model.save('/content/drive/My Drive/MIA/', save_format='tf')
print(tf.__version__)
! python --version