Load Data
Dataset is available here.
# Import the SPSS (.sav) client file into bank_loan_df.
bank_loan_df <- read_sav(file = "P4_bankloan_5000_clients.sav")
## tibble [5,000 x 9] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:5000] 41 30 40 41 57 45 36 39 43 34 ...
## ..- attr(*, "label")= chr "Age in years"
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 6
## $ education_level : Factor w/ 5 levels "1","2","3","4",..: 3 1 1 1 1 1 1 1 1 3 ...
## $ current_employ_year : num [1:5000] 17 13 15 15 7 0 1 20 12 7 ...
## ..- attr(*, "label")= chr "Years with current employer"
## ..- attr(*, "format.spss")= chr "F4.0"
## $ current_address_year: num [1:5000] 12 8 14 14 37 13 3 9 11 12 ...
## ..- attr(*, "label")= chr "Years at current address"
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 9
## $ income_household : num [1:5000] 35.9 46.7 61.8 72 25.6 28.1 19.6 80.5 68.7 33.8 ...
## ..- attr(*, "label")= chr "Household income in thousands"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ debt_income_ratio : num [1:5000] 11.9 17.9 10.6 29.7 15.9 ...
## ..- attr(*, "label")= chr "Debt to income ratio (x100)"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ credit_card_debt : num [1:5000] 0.504 1.353 3.439 4.166 1.498 ...
## ..- attr(*, "label")= chr "Credit card debt in thousands"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ other_debts : num [1:5000] 3.77 7 3.14 17.2 2.56 ...
## ..- attr(*, "label")= chr "Other debt in thousands"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ defaulted_loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## - attr(*, "label")= chr "Bank Loan Default -- Binning"
## - attr(*, "notes")= chr [1:7] "DOCUMENT This is a hypothetical data file that concerns a bank's efforts to redu" " ce" "the rate of loan defaults. This file contains financial and demographic" "information on 5000 past customers that the bank will use to create binning rule" ...
This is a hypothetical data file that concerns a bank’s efforts to reduce the rate of loan defaults. This file contains financial and demographic information on 5000 past customers that the bank will use
to create binning rules.
Train/Test Validation
Isn't this step universal in machine learning?
## Call:
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5859 -0.6588 -0.3438 0.1138 3.3020
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.169417 0.268244 -4.360 1.30e-05 ***
## age 0.004410 0.008174 0.540 0.5895
## education_level2 0.224490 0.108351 2.072 0.0383 *
## education_level3 0.259264 0.153824 1.685 0.0919 .
## education_level4 0.250029 0.185073 1.351 0.1767
## education_level5 0.018646 0.446741 0.042 0.9667
## current_employ_year -0.182293 0.012469 -14.619 < 2e-16 ***
## current_address_year -0.092239 0.010140 -9.096 < 2e-16 ***
## income_household -0.003279 0.003835 -0.855 0.3925
## debt_income_ratio 0.099422 0.012702 7.827 4.98e-15 ***
## credit_card_debt 0.425010 0.043483 9.774 < 2e-16 ***
## other_debts 0.013697 0.030109 0.455 0.6492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Dispersion parameter for binomial family taken to be 1)
## Null deviance: 4124.2 on 3650 degrees of freedom
## Residual deviance: 2946.7 on 3639 degrees of freedom
## AIC: 2970.7
## Number of Fisher Scoring iterations: 6
Testing the Logistic model
# Generate predictions for test_data from the fitted logistic model.
pred1 <- predict(logic_model, newdata = test_data)
Confusion Matrix
# Cross-tabulate the logistic predictions against the observed defaulted_loan labels.
# NOTE(review): no `positive` is supplied, so the reported 'Positive' class is "0";
# Sensitivity therefore describes the "0" class -- confirm this is intended.
confusionMatrix(data = pred1, reference = test_data$defaulted_loan)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 944 177
## 1 70 158
## Accuracy : 0.8169
## 95% CI : (0.7952, 0.8372)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 6.180e-09
## Kappa : 0.4508
## Mcnemar's Test P-Value : 1.534e-11
## Sensitivity : 0.9310
## Specificity : 0.4716
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.6930
## Prevalence : 0.7517
## Detection Rate : 0.6998
## Detection Prevalence : 0.8310
## Balanced Accuracy : 0.7013
## 'Positive' Class : 0
KNN Model with train/test validation
knn_model<-train(defaulted_loan~.,data = train_data,
preProcess = c("center", "scale"),
tuneLength = 10
Obtain the result
## k Accuracy Kappa AccuracySD KappaSD
## 1 5 0.7454197 0.2900344 0.011038244 0.02396152
## 2 7 0.7580398 0.3100812 0.011263579 0.02591339
## 3 9 0.7653001 0.3192384 0.012598023 0.02663501
## 4 11 0.7699141 0.3216275 0.012504271 0.02653078
## 5 13 0.7726898 0.3244535 0.013151810 0.03094986
## 6 15 0.7762592 0.3283860 0.012201179 0.02642366
## 7 17 0.7777579 0.3261075 0.012592331 0.02979285
## 8 19 0.7800209 0.3287250 0.010386847 0.02551861
## 9 21 0.7818723 0.3300842 0.009801227 0.02492881
## 10 23 0.7831656 0.3305184 0.009523488 0.02541875
Testing the model
# Generate predictions for test_data from the caret-trained KNN model.
pred2 <- predict(knn_model, newdata = test_data)
Confusion Matrix
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 949 218
## 1 65 117
## Accuracy : 0.7902
## 95% CI : (0.7675, 0.8117)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 0.0004827
## Kappa : 0.3366
## Mcnemar's Test P-Value : < 2.2e-16
## Sensitivity : 0.9359
## Specificity : 0.3493
## Pos Pred Value : 0.8132
## Neg Pred Value : 0.6429
## Prevalence : 0.7517
## Detection Rate : 0.7035
## Detection Prevalence : 0.8651
## Balanced Accuracy : 0.6426
## 'Positive' Class : 0
Fitting Naive Bayes
Training the model
# Fit a naive Bayes classifier for defaulted_loan using every other column of train_data.
naive_model <- naiveBayes(defaulted_loan ~ ., data = train_data)
## Length Class Mode
## apriori 2 table numeric
## tables 8 -none- list
## levels 2 -none- character
## isnumeric 8 -none- logical
## call 4 -none- call
Testing the model
# Generate predictions for test_data from the naive Bayes model.
pred3 <- predict(naive_model, newdata = test_data)
Confusion Matrix
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 974 260
## 1 40 75
## Accuracy : 0.7776
## 95% CI : (0.7545, 0.7996)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 0.01408
## Kappa : 0.2364
## Mcnemar's Test P-Value : < 2e-16
## Sensitivity : 0.9606
## Specificity : 0.2239
## Pos Pred Value : 0.7893
## Neg Pred Value : 0.6522
## Prevalence : 0.7517
## Detection Rate : 0.7220
## Detection Prevalence : 0.9148
## Balanced Accuracy : 0.5922
## 'Positive' Class : 0
Support Vector Machine (SVM) Model
Training the model
# Fit a C-classification SVM with a linear kernel for defaulted_loan,
# using every other column of train_data as a predictor.
svm_model <- svm(
  formula = defaulted_loan ~ .,
  data = train_data,
  type = "C-classification",
  kernel = "linear"
)
## Call:
## svm(formula = defaulted_loan ~ ., data = train_data, type = "C-classification",
## kernel = "linear")
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## Number of Support Vectors: 1614
## ( 810 804 )
## Number of Classes: 2
## Levels:
## 0 1
Testing the Model
# Generate predictions for test_data from the SVM model.
pred4 <- predict(svm_model, newdata = test_data)
Confusion Matrix
# Cross-tabulate the SVM predictions against the observed defaulted_loan labels.
# NOTE(review): no `positive` is supplied, so the reported 'Positive' class is "0".
confusionMatrix(data = pred4, reference = test_data$defaulted_loan)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 960 200
## 1 54 135
## Accuracy : 0.8117
## 95% CI : (0.7898, 0.8322)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 8.805e-08
## Kappa : 0.4095
## Mcnemar's Test P-Value : < 2.2e-16
## Sensitivity : 0.9467
## Specificity : 0.4030
## Pos Pred Value : 0.8276
## Neg Pred Value : 0.7143
## Prevalence : 0.7517
## Detection Rate : 0.7116
## Detection Prevalence : 0.8599
## Balanced Accuracy : 0.6749
## 'Positive' Class : 0
Decision Tree Model
Training the model
decision_tree_model <-train(defaulted_loan~.,
data = train_data,
parms = list(split = "information"),
Testing the model
# Generate predictions for test_data from the caret-trained decision tree.
pred5 <- predict(decision_tree_model, newdata = test_data)
Confusion Matrix
# Cross-tabulate the decision-tree predictions against the observed defaulted_loan labels.
# NOTE(review): no `positive` is supplied, so the reported 'Positive' class is "0".
confusionMatrix(data = pred5, reference = test_data$defaulted_loan)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 956 235
## 1 58 100
## Accuracy : 0.7828
## 95% CI : (0.7598, 0.8045)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 0.004044
## Kappa : 0.2932
## Mcnemar's Test P-Value : < 2.2e-16
## Sensitivity : 0.9428
## Specificity : 0.2985
## Pos Pred Value : 0.8027
## Neg Pred Value : 0.6329
## Prevalence : 0.7517
## Detection Rate : 0.7087
## Detection Prevalence : 0.8829
## Balanced Accuracy : 0.6207
## 'Positive' Class : 0
Artificial Neural Network (ANN) Model
Training the Model
# Train a neural network via caret's "nnet" method on centred and scaled
# predictors. The tuning grid holds a single candidate (size = 1, decay = 0),
# and candidates are compared on Accuracy.
ann_model <- train(
  defaulted_loan ~ .,
  data = train_data,
  method = "nnet",
  preProcess = c("center", "scale"),
  maxit = 250, # maximum number of iterations
  tuneGrid = data.frame(size = 1, decay = 0),
  # tuneGrid = data.frame(size = 0, decay = 0), skip = TRUE, # technically, this is log-reg
  metric = "Accuracy"
)
Testing the model
# Generate predictions for test_data from the caret-trained neural network.
pred6 <- predict(ann_model, newdata = test_data)
Confusion Matrix
# Cross-tabulate the neural-network predictions against the observed defaulted_loan labels.
# NOTE(review): no `positive` is supplied, so the reported 'Positive' class is "0".
confusionMatrix(data = pred6, reference = test_data$defaulted_loan)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 941 173
## 1 73 162
## Accuracy : 0.8176
## 95% CI : (0.796, 0.8379)
## No Information Rate : 0.7517
## P-Value [Acc > NIR] : 4.148e-09
## Kappa : 0.4573
## Mcnemar's Test P-Value : 2.754e-10
## Sensitivity : 0.9280
## Specificity : 0.4836
## Pos Pred Value : 0.8447
## Neg Pred Value : 0.6894
## Prevalence : 0.7517
## Detection Rate : 0.6976
## Detection Prevalence : 0.8258
## Balanced Accuracy : 0.7058
## 'Positive' Class : 0
Leave-One-Out Cross-Validation (LOOCV)
Read the data
# Re-import the SPSS (.sav) client file for the leave-one-out section.
bank_loan_df <- read_sav(file = "P4_bankloan_5000_clients.sav")
Logistic Regression With LOOCV Validation
Training Logistic Regression Model
# Draw one label (1 or 2) per row of bank_loan_df with probabilities 0.7 / 0.3.
# Presumably 1 marks training rows and 2 marks test rows; the subsetting that
# consumes `ind` is not visible here -- TODO confirm.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# NOTE(review): the draw is random; call set.seed() beforehand if a
# reproducible split is required.
ind <- sample(2, nrow(bank_loan_df), replace = TRUE, prob = c(0.7, 0.3))
Setting Up the Train Control
# Resampling configuration for caret: leave-one-out cross-validation.
loocv_train_control <- trainControl(method = "LOOCV")
Logistic Regression With LOOCV Validation
Training Logistic Regression Model
# )
KNN Model with LOOCV validation
Training KNN Model
knn_clf1<-train(defaulted_loan~.,data = train_data,
Obtain the result
## k Accuracy Kappa
## 1 5 0.7636879 0.3087625
## 2 7 0.7707801 0.3112221
## 3 9 0.7770213 0.3248772
Confusion Matrix for Model Evaluation
# Generate predictions for test_data from the KNN model tuned with LOOCV.
predicted_val_knn1 <- predict(knn_clf1, newdata = test_data)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1018 226
## 1 96 135
## Accuracy : 0.7817
## 95% CI : (0.7597, 0.8025)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 0.009238
## Kappa : 0.3277
## Mcnemar's Test P-Value : 6.532e-13
## Sensitivity : 0.9138
## Specificity : 0.3740
## Pos Pred Value : 0.8183
## Neg Pred Value : 0.5844
## Prevalence : 0.7553
## Detection Rate : 0.6902
## Detection Prevalence : 0.8434
## Balanced Accuracy : 0.6439
## 'Positive' Class : 0
Naïve Bayes classifier
Training the Model
usepoisson = TRUE,
Making Prediction on Test Data
# Generate predictions for test_data from the naive Bayes classifier nb_clf1.
predicted_val_nb1 <- predict(nb_clf1, newdata = test_data)
Confusion Matrix for Model Evaluation
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1094 308
## 1 20 53
## Accuracy : 0.7776
## 95% CI : (0.7555, 0.7986)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 0.02363
## Kappa : 0.1764
## Mcnemar's Test P-Value : < 2e-16
## Sensitivity : 0.9820
## Specificity : 0.1468
## Pos Pred Value : 0.7803
## Neg Pred Value : 0.7260
## Prevalence : 0.7553
## Detection Rate : 0.7417
## Detection Prevalence : 0.9505
## Balanced Accuracy : 0.5644
## 'Positive' Class : 0
K-Fold Cross Validation
Reading the File
# Re-import the SPSS (.sav) client file for the k-fold cross-validation section.
bank_loan_df <- read_sav(file = "P4_bankloan_5000_clients.sav")
Changing the data type of variables
Splitting the data into train and test set
# Draw one label (1 or 2) per row of bank_loan_df with probabilities 0.7 / 0.3.
# Presumably 1 marks training rows and 2 marks test rows; the subsetting that
# consumes `ind` is not visible here -- TODO confirm.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# NOTE(review): the draw is random; call set.seed() beforehand if a
# reproducible split is required.
ind <- sample(2, nrow(bank_loan_df), replace = TRUE, prob = c(0.7, 0.3))
Setting Up the Train Control
# Resampling configuration for caret: 10-fold cross-validation.
cv_train_control <- trainControl(method = "cv", number = 10)
Logistic Regression With Cross Validation
Training Logistic Regression Model
## Call:
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6490 -0.6635 -0.3442 0.1409 3.2833
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.235986 0.272446 -4.537 5.72e-06 ***
## age 0.006492 0.008297 0.782 0.4339
## education_level2 0.227329 0.110244 2.062 0.0392 *
## education_level3 0.260781 0.156468 1.667 0.0956 .
## education_level4 0.285038 0.186776 1.526 0.1270
## education_level5 0.020994 0.447370 0.047 0.9626
## current_employ_year -0.182777 0.012678 -14.416 < 2e-16 ***
## current_address_year -0.094317 0.010300 -9.157 < 2e-16 ***
## income_household -0.002470 0.003879 -0.637 0.5244
## debt_income_ratio 0.099652 0.012885 7.734 1.04e-14 ***
## credit_card_debt 0.425066 0.044558 9.540 < 2e-16 ***
## other_debts 0.006704 0.030495 0.220 0.8260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Dispersion parameter for binomial family taken to be 1)
## Null deviance: 3994.4 on 3524 degrees of freedom
## Residual deviance: 2850.2 on 3513 degrees of freedom
## AIC: 2874.2
## Number of Fisher Scoring iterations: 6
Making the Prediction
# Generate predictions for test_data from the cross-validated logistic model.
predicted_val_log1 <- predict(logistic_clf1, newdata = test_data)
Confusion Matrix for Evaluation
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1038 191
## 1 76 170
## Accuracy : 0.819
## 95% CI : (0.7984, 0.8383)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 2.487e-09
## Kappa : 0.4513
## Mcnemar's Test P-Value : 3.022e-12
## Sensitivity : 0.9318
## Specificity : 0.4709
## Pos Pred Value : 0.8446
## Neg Pred Value : 0.6911
## Prevalence : 0.7553
## Detection Rate : 0.7037
## Detection Prevalence : 0.8332
## Balanced Accuracy : 0.7013
## 'Positive' Class : 0
KNN Model with Cross validation
Training KNN Model
knn_clf1<-train(defaulted_loan~.,data = train_data,
Getting the Result of the Model
## k Accuracy Kappa AccuracySD KappaSD
## 1 5 0.7668056 0.3138335 0.01709039 0.03827953
## 2 7 0.7727611 0.3210782 0.01581367 0.03762291
## 3 9 0.7744568 0.3184971 0.01934934 0.05507607
Confusion Matrix for Model Evaluation
# Generate predictions for test_data from the cross-validated KNN model.
predicted_val_knn1 <- predict(knn_clf1, newdata = test_data)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1019 226
## 1 95 135
## Accuracy : 0.7824
## 95% CI : (0.7604, 0.8032)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 0.007801
## Kappa : 0.329
## Mcnemar's Test P-Value : 3.99e-13
## Sensitivity : 0.9147
## Specificity : 0.3740
## Pos Pred Value : 0.8185
## Neg Pred Value : 0.5870
## Prevalence : 0.7553
## Detection Rate : 0.6908
## Detection Prevalence : 0.8441
## Balanced Accuracy : 0.6443
## 'Positive' Class : 0
Naïve Bayes classifier
Training the Model
## Warning: package 'naivebayes' was built under R version 4.1.2
## naivebayes 0.9.7 loaded
usepoisson = TRUE,
## ================================== Naive Bayes ==================================
## - Call: naive_bayes.default(x = x, y = y, laplace = param$laplace, usekernel = TRUE, usepoisson = TRUE, adjust = param$adjust)
## - Laplace: 0
## - Classes: 2
## - Samples: 3525
## - Features: 11
## - Conditional distributions:
## - KDE: 11
## - Prior probabilities:
## - 0: 0.7461
## - 1: 0.2539
## ---------------------------------------------------------------------------------
Making Prediction on Test Data
# Generate predictions for test_data from the naive Bayes classifier nb_clf1.
predicted_val_nb1 <- predict(nb_clf1, newdata = test_data)
Confusion Matrix for Model Evaluation
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1094 308
## 1 20 53
## Accuracy : 0.7776
## 95% CI : (0.7555, 0.7986)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 0.02363
## Kappa : 0.1764
## Mcnemar's Test P-Value : < 2e-16
## Sensitivity : 0.9820
## Specificity : 0.1468
## Pos Pred Value : 0.7803
## Neg Pred Value : 0.7260
## Prevalence : 0.7553
## Detection Rate : 0.7417
## Detection Prevalence : 0.9505
## Balanced Accuracy : 0.5644
## 'Positive' Class : 0
Bagging, Boosting and Random Forest
Reading the File
# Re-import the SPSS (.sav) client file for the ensemble-model section.
bank_loan_df <- read_sav(file = "P4_bankloan_5000_clients.sav")
Splitting the data into train and test set
## Loading required package: ggplot2
## Loading required package: lattice
# Draw one label (1 or 2) per row of bank_loan_df with probabilities 0.7 / 0.3.
# Presumably 1 marks training rows and 2 marks test rows; the subsetting that
# consumes `ind` is not visible here -- TODO confirm.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# NOTE(review): the draw is random; call set.seed() beforehand if a
# reproducible split is required.
ind <- sample(2, nrow(bank_loan_df), replace = TRUE, prob = c(0.7, 0.3))
Bagging Model
Training the Model
## Warning: package 'ipred' was built under R version 4.1.2
data = train_data,
## Bagging classification trees with 25 bootstrap replications
## Call: bagging.data.frame(formula = defaulted_loan ~ ., data = train_data,
## coob = T)
## Out-of-bag estimate of misclassification error: 0.2295
Making the Prediction
# Generate predictions for test_data from the bagged classification trees.
predicted_bag_tree <- predict(bag_dtree_clf, newdata = test_data)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 991 191
## 1 123 170
## Accuracy : 0.7871
## 95% CI : (0.7653, 0.8078)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 0.0021549
## Kappa : 0.385
## Mcnemar's Test P-Value : 0.0001562
## Sensitivity : 0.8896
## Specificity : 0.4709
## Pos Pred Value : 0.8384
## Neg Pred Value : 0.5802
## Prevalence : 0.7553
## Detection Rate : 0.6719
## Detection Prevalence : 0.8014
## Balanced Accuracy : 0.6803
## 'Positive' Class : 0
Random Forest Model
Training the Model
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## margin
data = train_data)
## Call:
## randomForest(formula = defaulted_loan ~ ., data = train_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 20.88%
## Confusion matrix:
## 0 1 class.error
## 0 2420 210 0.07984791
## 1 526 369 0.58770950
Making the Prediction
# Generate predictions for test_data from the random forest model.
predicted_rf <- predict(rf_clf, newdata = test_data)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 1023 197
## 1 91 164
## Accuracy : 0.8047
## 95% CI : (0.7836, 0.8247)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 3.459e-06
## Kappa : 0.4137
## Mcnemar's Test P-Value : 6.125e-10
## Sensitivity : 0.9183
## Specificity : 0.4543
## Pos Pred Value : 0.8385
## Neg Pred Value : 0.6431
## Prevalence : 0.7553
## Detection Rate : 0.6936
## Detection Prevalence : 0.8271
## Balanced Accuracy : 0.6863
## 'Positive' Class : 0
Extreme Gradient Boosting
Training the Model
#data = train_data,
# )
Making the Prediction
#predicted_xgb<-predict(xglm_clf,newdata = test_data)