Commit
Added R script for Churn Dataset and a simple Python Script for 50_startups
1 parent a3691d4 · commit f0f67a9
Showing 2 changed files with 307 additions and 0 deletions.
@@ -0,0 +1,40 @@
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import seaborn as sns

sns.set_style(style='darkgrid')

# reading the dataset
dataset = pd.read_csv('../50_Startups.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# one-hot encoding the categorical column at index 3
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

# splitting the data into training & test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# fitting a multiple linear regression model
lr = LinearRegression()
lr.fit(x_train, y_train)

# prediction: print predicted vs. actual values side by side
y_pred = lr.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# model coefficients and intercept
print(lr.coef_)
print(lr.intercept_)
Lashuk1729_Different_ML_Algorithms/basic_data_analysis.r (267 additions, 0 deletions)
@@ -0,0 +1,267 @@
# libraries installation
install.packages('pastecs')
install.packages('readr')
install.packages('Jmisc')
install.packages('ggplot2')
install.packages('plotly')
install.packages('corrplot')
install.packages('caret')
install.packages('e1071')
install.packages('gbm')
install.packages("klaR", dependencies = TRUE)
install.packages("car", dependencies = TRUE)
install.packages("mctest")
install.packages("GGally")
install.packages("caTools")
install.packages('mlbench')
install.packages('rpart')
install.packages('rpart.plot')

# libraries used
# for basic data summary
library(pastecs)
library(readr)

# for visualization
library(ggplot2)
library(plotly)
library(corrplot)
library(caTools)

# for checking multicollinearity
library(mctest)
library(car)
library(GGally)

# for applying machine learning algorithms
# for regression & classification
library(caret)
# for svm
library(e1071)
# for gradient boosting
library(gbm)
# for classification and visualization (knn, k-modes)
library(klaR)
# for decision trees and plotting them
library(rpart)
library(rpart.plot)
# for benchmarking
library(mlbench)

# reading and previewing the first 5 rows of the dataset
df <- read.csv("../Churn_Modelling.csv")
head(df, 5)

# checking the structure of the dataset and the number of missing values in each column
str(df)
sapply(df, function(x) sum(is.na(x)))

# checking the number of unique values in each column
sapply(df, function(x) length(unique(x)))

# removing the first 3 columns (features) as they are specific to individual customers
df <- subset(df, select = -c(RowNumber, CustomerId, Surname))
dim(df)
head(df, 2)

# Understanding the y feature (Exited) of the dataset
# bar-graph
ggplot(df, aes(x = Exited)) +
  geom_histogram(binwidth = 1, fill = c("blue", "darkred")) +
  xlab("Exited") +
  ylab("Frequency") +
  ggtitle("Individuals who Exited")

# pie-chart
slices <- table(df$Exited)
pct <- round(slices / sum(slices) * 100)
lbls <- paste(names(slices), pct, sep = " or ")
lbls <- paste(lbls, "%")
pie(slices, labels = lbls, col = terrain.colors(length(lbls)),
    main = "Pie Chart of Exited")

# Insight:-
# Only about 20% of the customers have exited. Since the goal is to model
# this feature, we need more exploration of the data to see the correlation
# between the predictors and the outcome variable.
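
# As a quick sanity check of the 20% figure, the exact class proportions:
prop.table(table(df$Exited))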

# Visualizing the exited individuals based on tenure
ggplot(df, aes(x = Tenure, fill = factor(Exited))) +
  geom_histogram(binwidth = 1) +
  geom_vline(aes(xintercept = mean(Tenure))) +
  facet_grid(~ Exited)

# Insight:-
# From the histogram, we can see that the average tenure is around 5 years
# for both exited and non-exited individuals.
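
# The same reading as numbers: mean tenure within each Exited group.
aggregate(Tenure ~ Exited, data = df, FUN = mean)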

# Visualizing the exited individuals based on gender
ggplot(df, aes(x = Gender, y = Exited)) +
  geom_point(position = position_jitter(0.3))

# Insight:-
# Considering the plot, we can see that female individuals have exited
# more than male individuals.

# Visualizing the exited individuals based on Geography
ggplot(df, aes(x = Exited, fill = Geography)) +
  geom_density(col = NA, alpha = 0.25)

# Insight:-
# From the visualization, we can say that individuals living in Germany
# exited more than those living in France and Spain.
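
# Cross-tabulated churn rates back up both plots: the share of exited
# customers within each Gender and within each Geography.
prop.table(table(df$Gender, df$Exited), margin = 1)
prop.table(table(df$Geography, df$Exited), margin = 1)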

# Visualizing the correlation between the numerical variables
p1 <- df[, -which(names(df) == "Geography")]
p2 <- p1[, -which(names(p1) == "Gender")]
corr <- cor(p2)
d <- corrplot(corr)

# Converting the categorical variables to numerical and plotting the new correlation
# Converting the categorical variable (Gender) to numerical
df$Gender <- as.factor(df$Gender)
df$Gender <- as.numeric(df$Gender) - 1
is.numeric(df$Gender)

# Converting the categorical variable (Geography) to numerical
df$Geography <- as.factor(df$Geography)
df$Geography <- as.numeric(df$Geography) - 1
is.numeric(df$Geography)

corr <- cor(df)
d <- corrplot(corr, method = "color", order = "hclust")

# Insight:-
# From the correlation plot, we can extract the following insights:-
# - Age and Balance have some positive correlation with Exited.
# - Gender and IsActiveMember have some negative correlation with Exited.
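
# The same insights as numbers: correlations of every variable with
# Exited, sorted from most positive to most negative.
sort(cor(df)[, "Exited"], decreasing = TRUE)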

# After the column removal and feature engineering, we have the final
# summary of the dataset
summary(df)
str(df)

# splitting the data into training & test sets
dataset <- createDataPartition(df$Exited, p = 4/5, list = FALSE)
df_train <- df[dataset, ]
df_test <- df[-dataset, ]

dim(df_train)
dim(df_test)

# Feature Scaling (optional; note the column indices must match on both sides)
# df_train[c(1,4,5,6,7,10)] <- lapply(df_train[c(1,4,5,6,7,10)], function(x) c(scale(x)))
# df_test[c(1,4,5,6,7,10)] <- lapply(df_test[c(1,4,5,6,7,10)], function(x) c(scale(x)))
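
# If scaling is enabled, a safer variant is to fit the scaling parameters on
# the training set only and reuse them on the test set. A sketch, kept
# commented out like the block above (the column names are assumed to be the
# numeric predictors of this dataset):
# num_cols <- c("CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary")
# means <- sapply(df_train[num_cols], mean)
# sds   <- sapply(df_train[num_cols], sd)
# df_train[num_cols] <- scale(df_train[num_cols])
# df_test[num_cols]  <- scale(df_test[num_cols], center = means, scale = sds)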

# Build the logistic regression model
logitMod <- glm(Exited ~ ., data = df_train, family = binomial(link = "logit"))
summary(logitMod)
# Checking multicollinearity
vif(logitMod)
plot(logitMod)

# Insight:
# Considering the rule of thumb, if VIF is:
# 1) 1 = not correlated.
# 2) Between 1 and 5 = moderately correlated.
# 3) Greater than 5 = highly correlated.
# Since all VIF values here are low, we can safely say that the features
# are not strongly correlated.
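
# The rule of thumb, applied programmatically: flag any predictor whose
# VIF exceeds 5 (per the insight above, none are expected).
vif_vals <- vif(logitMod)
vif_vals[vif_vals > 5]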

# Data Preparation
# Feature Selection: keeping only the relevant and important attributes in the data
# Ranking the features based on importance
set.seed(100)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# train the model
model <- train(Exited ~ ., data = df_train, method = "rpart", trControl = control)
# estimate variable importance
importance <- varImp(model, scale = FALSE)
# summarize importance
print(importance)
# plot importance
plot(importance)

# Insight:-
# Based on the graph, we can see that:-
# - NumOfProducts, Age, IsActiveMember, Balance, Gender and Geography are
#   the six most important features.
# - Other features like EstimatedSalary and CreditScore can be neglected.
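
# One optional way to act on that ranking: keep only the six selected
# predictors (plus the outcome). The models below achieve the same effect
# by listing the six predictors in their formulas.
keep <- c("NumOfProducts", "Age", "IsActiveMember", "Balance",
          "Gender", "Geography", "Exited")
head(df_train[, keep], 2)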

################################################################################

# Refitting the logistic regression model on the selected features
logitMod <- glm(Exited ~ NumOfProducts + Age + IsActiveMember + Balance +
                  Gender + Geography, data = df_train, family = binomial)

logPredict <- predict(logitMod, df_test, type = "response")

# ROC Curve (note: colAUC returns the AUC, which this script reuses as the
# classification cutoff)
model_AUC <- colAUC(logPredict, df_test$Exited, plotROC = T)
abline(h = model_AUC, col = "Red")
text(.2, .9, cex = .8, labels = paste("Cutoff:", round(model_AUC, 4)))

# Cutoff taken from the ROC curve (the AUC value) = 0.7319
result_class <- ifelse(logPredict > 0.7319, 1, 0)
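
# Such a high cutoff explains the very low recall reported below. One could
# instead pick the cutoff that maximizes Youden's J (sensitivity +
# specificity - 1) over a simple grid; a minimal sketch:
cutoffs <- seq(0.1, 0.9, by = 0.05)
youden <- sapply(cutoffs, function(cutoff) {
  pred <- ifelse(logPredict > cutoff, 1, 0)
  sens <- mean(pred[df_test$Exited == 1] == 1)
  spec <- mean(pred[df_test$Exited == 0] == 0)
  sens + spec - 1
})
cutoffs[which.max(youden)]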

# Converting the predictions and the actual Exited values to factors
result_class <- factor(result_class)
actual_class <- factor(df_test$Exited)

# Confusion Matrix
confusionMatrix(result_class, actual_class, mode = "prec_recall", positive = "1")

# Insight:-
# The accuracy of the logistic regression model is 79.5%.
# From the confusion matrix, we can see that:-
# 1. Precision : 0.70588
# 2. Recall    : 0.02878
# 3. F1        : 0.05530

################################################################################

# Using a Decision Tree
deciTreeMod <- rpart(Exited ~ NumOfProducts + Age + IsActiveMember + Balance +
                       Gender + Geography, data = df_train, method = 'class')

rpart.plot(deciTreeMod, extra = 106)

decitreePredict <- predict(deciTreeMod, df_test, type = "class")

levels(decitreePredict)
actual_class <- factor(df_test$Exited)

# Confusion Matrix
confusionMatrix(decitreePredict, actual_class, mode = "prec_recall", positive = "1")

# Insight:-
# The accuracy of the decision tree model is 84.1%.
# From the confusion matrix, we can see that:-
# 1. Precision : 0.7143
# 2. Recall    : 0.3957
# 3. F1        : 0.5093

################################################################################

# Model Comparison
# The main focus is to compare the models based on accuracy. The best model is
# the Decision Tree with 84.1%, followed by Logistic Regression with 79.5%.
# Other algorithms, such as Random Forest, Naive Bayes and SVM, are also
# worth considering.
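
# The comparison gathered into one small table, computed from the
# prediction vectors produced above.
acc <- function(pred, actual) mean(pred == actual)
data.frame(model    = c("logistic regression", "decision tree"),
           accuracy = c(acc(result_class, actual_class),
                        acc(decitreePredict, actual_class)))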

################################################################################

# Conclusion
# - From the descriptive analysis performed, female individuals exited more
#   than male individuals, and customers located in Germany churn more than
#   those in other locations.
# - Of the predictions made by the models, the Decision Tree classifies best,
#   with an accuracy of 84.1%.
# - This accuracy could be improved further by trying different algorithms,
#   and by obtaining additional information so that the feature engineering
#   process can be improved.
# - My personal thought is that we should concentrate on retaining the
#   individuals who are likely to exit rather than acquiring new ones. It is
#   always easier to retain the individuals who are with us than to obtain
#   new ones.

################################################################################