## Customer Churn.R
setwd("C:/Users/Dhawani/Desktop/22.08.19/Project")
getwd()
library(readxl)
library(psych)
library(car)
#################### Q1: EDA ####################
mydata<-read_excel("Cellphone.xlsx", sheet = 2)
View(mydata)
dim(mydata)
head(mydata)
tail(mydata)
names(mydata)
str(mydata)
##All are numeric
attach(mydata)
data.frame(mydata)
class(mydata)
####CHECKING FOR MISSING DATA
sum(is.na(mydata))
rowSums(is.na(mydata))
colSums(is.na(mydata))
##No missing data
describe(mydata)
###univariate Analysis
plot(mydata, col="Blue")
churnrate<-table(mydata$Churn) / nrow(mydata)
churnrate
## Over the entire dataset, 14.49% of customers churned
## Churn is the response variable; all others are predictors
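## A quick bar chart of the class split (a small sketch visualising the
## imbalance noted above)
barplot(table(mydata$Churn), col = "Blue", main = "Churn distribution (0 = no churn, 1 = churn)")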
##graphs
boxplot(AccountWeeks, horizontal = TRUE, col="Blue", main = "Boxplot for Number of Weeks customer has had active account")
boxplot(CustServCalls, horizontal = TRUE, col="Blue", main = "Boxplot for Number of calls made to customer service")
boxplot(DayMins, horizontal = TRUE, col="Blue", main = "Boxplot for average day time minutes per month")
boxplot(DayCalls, horizontal = TRUE, col="Blue", main = "Boxplot for average number of daytime calls")
boxplot(OverageFee, horizontal = TRUE, col="Blue", main = "Boxplot for largest overage fees in last 12 months")
boxplot(RoamMins, horizontal = TRUE, col="Blue", main = "Boxplot for average number of Roaming Minutes")
table(mydata$ContractRenewal)   # mydata2 is not created until later, so use mydata here
library(rpivotTable)
rpivotTable(mydata)
## From the pivot table: Churn, ContractRenewal, and DataPlan are all binary (yes/no) variables
boxplot(CustServCalls ~ Churn, col = "Blue", main = "Boxplot of Customer Service Calls by Churn")
cor(CustServCalls, Churn)
by(mydata, INDICES = Churn, FUN = summary)
###Regression
reg<-lm(Churn~.,data = mydata)
summary(reg)
vif(reg)
### Variables with VIF greater than 4 will be removed one at a time
## Removing MonthlyCharge
mydata1<-mydata[!names(mydata) %in% c("MonthlyCharge")]
mydata1
dim(mydata1)
reg1<-lm(Churn~.,data = mydata1)
summary(reg1)
vif(reg1)
## Removing DataUsage
mydata2<-mydata1[!names(mydata1) %in% c("DataUsage")]
mydata2
dim(mydata2)
reg2<-lm(Churn~.,data = mydata2)
summary(reg2)
vif(reg2)
### All VIFs are now within range
### Checking for outliers and influential observations using Cook's distance
cd = cooks.distance(reg2)
max(cd)
plot(cd, pch = "*", cex = 2, main = "Influential observations by Cook's distance")
## Using the conventional cutoff of 0.5
## No observation exceeds it, so none needs to be removed
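## Hypothetical follow-up sketch: had any point crossed the 0.5 cutoff, it
## could be located and dropped like this
influential = which(cd > 0.5)
influential                            # empty here, so nothing to remove
# mydata2 <- mydata2[-influential, ]   # only run if influential is non-empty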
View(mydata2)
library(corrplot)
corrplot(cor(mydata2), method="shade")
#=====================Question 2=================================
#Building Models=================================================
###Logistic Regression
Predict = predict(reg2)
data.frame(Churn, Predict)
## Step 1: Likelihood ratio test
library(lmtest)
logit= glm(Churn~.,data = mydata2, family = binomial)
lrtest(logit)
## The p-value is well below 0.05, so the model is significant:
## churn depends on the given predictors
## Step 2: R-squared computation and interpretation
# install.packages("pscl")   # run once if pscl is not yet installed
library(pscl)
pR2(logit)
## McFadden's R-squared falls in the 0.10-0.30 range, indicating a reasonable fit
## It measures how much of the null (intercept-only) model's uncertainty
## the fitted model explains
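## Sketch: McFadden's R-squared computed by hand, to make the comparison with
## the intercept-only model explicit (logit0 is a hypothetical name)
logit0 = glm(Churn ~ 1, data = mydata2, family = binomial)
1 - as.numeric(logLik(logit)) / as.numeric(logLik(logit0))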
#Step 3- Test of individual coefficients (betas) and interpretation
summary(logit)
## Step 4: Explanatory power of odds ratios
odds = exp(coef(logit))
odds
Probability = odds/(1+odds)
Probability
## The odds ratios and their implied probabilities are the key outputs here
## Interpreting one coefficient as an example:
## if AccountWeeks increases by 1 unit, the odds of churning (vs. not churning)
## are multiplied by about 1.0000667, holding the other variables constant
## In general, exp(beta) gives the multiplicative change in the odds of churn
## for a one-unit increase in that predictor, other variables held constant
## The significant predictors above are therefore useful for predicting churn
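## Sketch: a single odds ratio can be pulled out by coefficient name
## (CustServCalls is one of the predictors in mydata2)
exp(coef(logit)["CustServCalls"])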
##Step 5: Confusion matrix for measuring predictive accuracy
predict(logit, type="response")
## This returns the predicted churn probability for each record
## We want a 0/1 outcome rather than a decimal, so we round at 0.5 below
attach(mydata2)
Pred = fitted(logit)
data.frame(Churn, Pred)
gg1 = floor(Pred + 0.50)
gg1
table(Actual = Churn , Prediction = gg1)
## 2781 + 89 = 2870 correct predictions and 463 misclassified
## Accuracy is therefore 2870/3333 = 86%
## Accuracy, sensitivity, and specificity can all be read off the confusion
## matrix, as the sketch below shows
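## A minimal sketch computing all three rates from the 2x2 confusion matrix
## produced above (rows = actual, columns = predicted; classes are 0/1)
cm = table(Actual = Churn, Prediction = gg1)
accuracy = sum(diag(cm)) / sum(cm)
sensitivity = cm["1", "1"] / sum(cm["1", ])   # true positive rate
specificity = cm["0", "0"] / sum(cm["0", ])   # true negative rate
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)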
## Step 6: ROC plot and interpretation
## The ROC curve should be built from the fitted probabilities, not the rounded labels
library(ROCR)
pred1 = prediction(Pred, Churn)   # fitted probabilities, not the 0/1 labels in gg1
perf1 = performance(pred1, "tpr", "fpr")
plot(perf1, col = "Blue", main = "ROC Plot")
abline(0, 1, lty = 8, col = "red")
auc = performance(pred1, "auc")
auc = auc@y.values
auc
## Note: an ROC built on the rounded labels (gg1) gives an AUC of only 0.58;
## the probabilities trace the full curve
#=======KNN Method
#Choose value of k based on lowest error rate in validation data
## When k = n (the entire dataset), KNN reduces to the "naive rule":
## classify every record as the majority class
## For classification, each record gets the majority vote of its k nearest neighbours
View(mydata2)
mydata3 <- mydata2[, -1]   # drop the response column (Churn)
View(mydata3)
mydata4 <- mydata3[, -2]   # drop the binary ContractRenewal indicator
View(mydata4)
mydata5 <- mydata4[, -2]   # drop the binary DataPlan indicator
View(mydata5)
library(dplyr)
##step 1 normalise the data
norm = function(x) { (x-min(x))/(max(x)-min(x))}
norm.data = as.data.frame(lapply(mydata5,norm))
View(mydata5)
View(norm.data)
usable.data = cbind(mydata2[,1], norm.data)
str(usable.data)
View(usable.data)
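## Quick sanity check (a small sketch): every normalised column should now
## span exactly [0, 1]
sapply(norm.data, range)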
# Data partitioning
library(caTools)
set.seed(123)   # arbitrary seed so the 70/30 split is reproducible
spl = sample.split(usable.data$Churn, SplitRatio = 0.7)
train = subset(usable.data, spl == TRUE)
test = subset(usable.data, spl == FALSE)
dim(train)
dim(test)
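## Sketch: sample.split stratifies on Churn, so both partitions should keep
## roughly the 14.5% churn rate of the full data
prop.table(table(train$Churn))
prop.table(table(test$Churn))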
# Use KNN classifier
library(class)
## Fit KNN for several values of k and record the confusion-matrix accuracy of each
ks = c(7, 13, 15, 17, 19)
acc = numeric(length(ks))
for (i in seq_along(ks)) {
  pred = knn(train[-1], test[-1], train[, 1], k = ks[i])
  table.knn = table(test[, 1], pred)
  print(table.knn)                                 ## confusion matrix for this k
  acc[i] = sum(diag(table.knn)) / sum(table.knn)
}
acc   ## accuracy for each k
### Choose the value of k that gives the highest accuracy (picked programmatically below)
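## A minimal sketch selecting the optimum k from the accuracies recorded above
## (which.max returns the first maximum in case of a tie)
best.k = ks[which.max(acc)]
best.k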
##==================Naive Bayes
#Use NB Classifier
library(e1071)
NB = naiveBayes(Churn~., data=train)
predNB = predict(NB, test, type = "class")
tab.NB = table(test[, 1], predNB)   # compare against the NB predictions, not the leftover KNN ones
tab.NB
sum(diag(tab.NB)) / sum(tab.NB)
##Accuracy 87.4%
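## Sketch: naiveBayes can also return class probabilities (type = "raw"),
## which would allow a cutoff other than the default 0.5 if needed
head(predict(NB, test, type = "raw"))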
#====================Model Comparison==================
## Cross-validation
library(caret)   # createFolds() and train() come from caret, so load it before use
library(klaR)    # naive Bayes backend for method = "nb"
folds <- createFolds(factor(train$Churn), k = 10, list = FALSE)
folds
folds <- createFolds(factor(train$Churn), k = 10, list = TRUE)
folds
train_control = trainControl(method = "cv", number = 10)
modelKNN = train(as.factor(Churn)~., data=train , trControl = train_control, method = "knn")
summary(modelKNN)
print(modelKNN)
modelNB = train(as.factor(Churn)~., data=train, trControl = train_control, method = "nb")
summary(modelNB)
print(modelNB)
### collect resamples
results = resamples(list(KNN = modelKNN, NB = modelNB))
summary(results)
bwplot(results)
### The KNN model has the higher median accuracy
## Kappa is also higher for KNN than for NB
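## A short follow-up sketch: caret's diff() on a resamples object runs paired
## comparisons of the two models' accuracy and kappa across the CV folds
diffs = diff(results)
summary(diffs)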