-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathshrinking_air_dataset.R
68 lines (48 loc) · 2.37 KB
/
shrinking_air_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Project: airline cancellations and delays
# Author: Mohsen Zardadi
# January 2019
##################################################################################################
##################################################################################################
# The data in air.csv relate to airline cancellations and delays for the year 2008.
# You can download the air.csv dataset from
# http://rtricks4kids.ok.ubc.ca/wjbraun/DS550/air.csv.
# It is too large to fit on the regular github repository.
# In order to read the data from air.csv into R, you will need to type
#air <- read.csv("air.csv")
#summary(air)
#head(air)
############################################# data wrangling ##############################################################
###########################################################################################################################
library(dplyr)
# Explore the air data set.
# `air` must hold the contents of air.csv. An object that is already loaded in
# the session is reused; otherwise the file is read from the working directory
# with the read.csv() call given in the header of this script.
if (!exists("air")) {
  if (!file.exists("air.csv")) {
    stop(
      "`air` is not loaded and air.csv was not found in the working directory",
      call. = FALSE
    )
  }
  air <- read.csv("air.csv")
}
summary(air)
# Replace empty strings with NA so that they are counted as missing values.
# Only character/factor columns can hold "", and working column by column
# avoids building a logical matrix as large as the whole data set.
air[] <- lapply(air, function(column) {
  if (is.character(column) || is.factor(column)) {
    # which() skips entries that are already NA.
    column[which(column == "")] <- NA
  }
  column
})
# Count the missing values in each column (vapply keeps the result type fixed).
vapply(air, function(x) sum(is.na(x)), integer(1))
# missing values visualizing.
#library(Amelia)
#missmap(air, main = "Missing values vs observed")
# Drop the columns that are not relevant or that have the most missing values.
drop <- c(
  "DepTime", "ArrTime", "ActualElapsedTime", "CRSElapsedTime", "AirTime",
  "TaxiIn", "TaxiOut", "LateAircraftDelay", "SecurityDelay", "NASDelay",
  "WeatherDelay", "CarrierDelay", "TailNum", "CancellationCode", "CRSDepTime",
  "FlightNum", "Distance", "Cancelled", "Diverted", "CRSArrTime"
)
# Keep every column whose name is not listed in `drop`; names in `drop` that
# are not columns of `air` have no effect. drop = FALSE keeps the result a
# data frame even if only a single column is left.
airdf <- air[, !(names(air) %in% drop), drop = FALSE]
# Count the missing values in each column of the subset.
# vapply() is used instead of sapply() so the result is always an integer
# vector, whatever the shape of the input.
vapply(airdf, function(x) sum(is.na(x)), integer(1))
# Remove every row that contains a missing value.
# (An alternative would be to replace missing values with specific values.)
airdf <- na.omit(airdf)
# Check the missing-value counts again.
vapply(airdf, function(x) sum(is.na(x)), integer(1))
# No more missing values, hooray!
# Explore the data: number of distinct values in each column.
vapply(airdf, function(x) length(unique(x)), integer(1))
################################################################################################################################
###############################################################################################################################
# Write the reduced data set to disk so that the shiny app can load it.
save(list = "airdf", file = "airdf.RData")
# A subset for one specific airport may also be needed: keep only the rows
# whose destination is ORD and store them in their own file.
ord <- subset(airdf, subset = Dest == "ORD")
save(list = "ord", file = "ord.RData")