CART (Classification and Regression Trees) is a popular decision tree algorithm used for both classification and regression problems. It constructs a binary tree in which each internal node represents a test on a single feature and each leaf node represents a prediction: a class label for classification (our task) or a numeric value for regression. Nodes are split according to a measure of impurity, such as Gini impurity or entropy. The CART algorithm is often used in applications such as finance, marketing, and healthcare.

Model Construction

Show/Hide Code
#----------------------#
#----Decision Tree-----#
#----------------------#
# Baseline CART model: caret's "rpart2" method tunes the maximum tree depth,
# and every candidate depth is scored with 10-fold cross-validation.
set.seed(1234)
train_control <- trainControl(number = 10, method = "cv")

# Re-seed right before fitting so the fit starts from a known RNG state.
set.seed(1234)
dc_model <- train(
  good ~ .,
  data = train,
  method = "rpart2",
  na.action = na.omit,
  trControl = train_control
)

# Persist the fitted model; the evaluation section loads it from this path.
save(dc_model, file = "dataset\\model\\dc.model_kfoldCV.Rdata")


#----------------------------#
#----Decision Tree (Mod)-----#
#----------------------------#
# Tuned CART model: caret's "rpart" method tunes the complexity parameter `cp`.
# Candidates come from an explicit grid and are scored by 10-fold
# cross-validation.
set.seed(1234)
train_control <- trainControl(method = "cv", number = 10)

# Re-seed right before fitting so the fit starts from a known RNG state.
set.seed(1234)
dc_model <- train(good ~ .,
                  data = train,
                  method = "rpart",
                  trControl = train_control,
                  # Explicit grid: cp = 0.001, 0.006, ..., 0.096 (20 values).
                  # `tuneLength` is deliberately not passed: caret only uses it
                  # to build a default grid when `tuneGrid` is absent, so
                  # supplying both is misleading.
                  tuneGrid = data.frame(cp = seq(0.001, 0.1, by = 0.005)))

# Persist the tuned model under its own file name. Note that the object is
# still called `dc_model`, so loading this file replaces the baseline model.
save(dc_model, file = "dataset\\model\\dc.model_kfoldCV_mod.Rdata")

K-fold CV

Show/Hide Code
# Data Import: cleaned data set plus the train/test partitions
load("dataset\\wine.data_cleaned.Rdata")
load("dataset\\train.Rdata")
load("dataset\\test.Rdata")

# Function Import: plotting helper for the accuracy/kappa chart
load("dataset\\function\\accu.kappa.plot.Rdata")

# Model import: restores the baseline `dc_model`
load("dataset\\model\\dc.model_kfoldCV.Rdata")

# Predict on the held-out test set and compare against the observed `good`
# labels.
dc.predictions <- predict(object = dc_model, newdata = test)

confusionMatrix(data = dc.predictions, reference = test$good)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 860 128
         1  89 111
                                          
               Accuracy : 0.8173          
                 95% CI : (0.7942, 0.8389)
    No Information Rate : 0.7988          
    P-Value [Acc > NIR] : 0.058574        
                                          
                  Kappa : 0.3947          
                                          
 Mcnemar's Test P-Value : 0.009891        
                                          
            Sensitivity : 0.9062          
            Specificity : 0.4644          
         Pos Pred Value : 0.8704          
         Neg Pred Value : 0.5550          
             Prevalence : 0.7988          
         Detection Rate : 0.7239          
   Detection Prevalence : 0.8316          
      Balanced Accuracy : 0.6853          
                                          
       'Positive' Class : 0               
                                          
Show/Hide Code
# Turn the predicted factor into its numeric level codes so ROCR can use it
# as a score.
# NOTE(review): these are hard class labels, not class probabilities, so the
# ROC curve has a single operating point and the AUC below equals
# (Sensitivity + Specificity) / 2, i.e. the balanced accuracy. Consider
# scoring with predict(dc_model, newdata = test, type = "prob") if a
# threshold-free AUC is wanted.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(predictions = dc.predictions, labels = test$good)

# Extract the scalar AUC from the performance object and print it.
auc_val <- slot(performance(pred_obj, measure = "auc"), "y.values")[[1]]
auc_val
[1] 0.6853261
Show/Hide Code
# ROC curve (TPR against FPR) for the baseline model, with the region under
# the curve shaded and the AUC printed on the figure.
roc_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")

# Semi-transparent blue shared by both shaded regions.
roc_fill <- rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3)

plot(
  roc_obj,
  colorize = TRUE,
  lwd = 2,
  main = "CART (10-fold CV)",
  xlab = "False Positive Rate",
  ylab = "True Positive Rate"
)
abline(a = 0, b = 1) # reference diagonal

# Flatten the curve coordinates into plain numeric vectors for polygon().
x_values <- as.numeric(unlist(slot(roc_obj, "x.values")))
y_values <- as.numeric(unlist(slot(roc_obj, "y.values")))

# First polygon: the area enclosed by the curve's own points.
# Second polygon: the triangle beneath the diagonal.
polygon(x_values, y_values, col = roc_fill, border = NA)
polygon(c(0, 1, 1), c(0, 0, 1), col = roc_fill, border = NA)

text(0.6, 0.4, paste("AUC =", round(auc_val, 4)))
Show/Hide Code
# Snapshot the ROC figure currently on the graphics device so it can be
# replayed later.
dc.kfoldCV.ROC.plot <- recordPlot()

# Cross-validation results per candidate max depth. The depth column is named
# `k` because the geom_text() layers below map `x = k`.
dc_df <- data.frame(k = dc_model$results$maxdepth,
                    Accuracy = dc_model$results$Accuracy,
                    Kappa = dc_model$results$Kappa)

# Accuracy/Kappa chart with each value labelled (rotated 90 degrees).
# Every ggplot component must be chained with `+`; without the `+` after
# labs(), ggtitle() is evaluated as a separate statement and the title never
# reaches the plot.
dc.kfoldCV.plot <- accu.kappa.plot(dc_df) +
  geom_text(aes(x = k, y = Accuracy, label = round(Accuracy, 3)),
            hjust = -0.3, angle = 90) +
  geom_text(aes(x = k, y = Kappa, label = round(Kappa, 3)),
            hjust = -0.3, angle = 90) +
  labs(x = "Max Depth") +
  ggtitle("CART Model Performance")
Show/Hide Code
# Render the cross-validation results (one row per candidate max depth).
pander::pander(dc_model[["results"]])
maxdepth Accuracy Kappa AccuracySD KappaSD
3 0.7967 0.2956 0.02037 0.0916
5 0.7992 0.247 0.01744 0.07917
9 0.797 0.2773 0.01861 0.0978

Tuned

Show/Hide Code
# Model Import: the saved object is also named `dc_model`, so this replaces
# the baseline model with the tuned one.
load("dataset\\model\\dc.model_kfoldCV_mod.Rdata")

# Predict on the held-out test set and compare against the observed `good`
# labels.
dc.predictions <- predict(object = dc_model, newdata = test)

confusionMatrix(data = dc.predictions, reference = test$good)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 865 131
         1  84 108
                                          
               Accuracy : 0.819           
                 95% CI : (0.7959, 0.8405)
    No Information Rate : 0.7988          
    P-Value [Acc > NIR] : 0.043152        
                                          
                  Kappa : 0.3922          
                                          
 Mcnemar's Test P-Value : 0.001706        
                                          
            Sensitivity : 0.9115          
            Specificity : 0.4519          
         Pos Pred Value : 0.8685          
         Neg Pred Value : 0.5625          
             Prevalence : 0.7988          
         Detection Rate : 0.7281          
   Detection Prevalence : 0.8384          
      Balanced Accuracy : 0.6817          
                                          
       'Positive' Class : 0               
                                          
Show/Hide Code
# Turn the predicted factor into its numeric level codes so ROCR can use it
# as a score.
# NOTE(review): these are hard class labels, not class probabilities, so the
# ROC curve has a single operating point and the AUC below equals
# (Sensitivity + Specificity) / 2, i.e. the balanced accuracy. Consider
# scoring with predict(dc_model, newdata = test, type = "prob") if a
# threshold-free AUC is wanted.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(predictions = dc.predictions, labels = test$good)

# Extract the scalar AUC from the performance object and print it.
auc_val <- slot(performance(pred_obj, measure = "auc"), "y.values")[[1]]
auc_val
[1] 0.6816843
Show/Hide Code
# ROC curve (TPR against FPR) for the tuned model, with the region under the
# curve shaded and the AUC printed on the figure.
roc_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")

# Semi-transparent blue shared by both shaded regions.
roc_fill <- rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3)

plot(
  roc_obj,
  colorize = TRUE,
  lwd = 2,
  main = "CART Tuned (10-fold CV)",
  xlab = "False Positive Rate",
  ylab = "True Positive Rate"
)
abline(a = 0, b = 1) # reference diagonal

# Flatten the curve coordinates into plain numeric vectors for polygon().
x_values <- as.numeric(unlist(slot(roc_obj, "x.values")))
y_values <- as.numeric(unlist(slot(roc_obj, "y.values")))

# First polygon: the area enclosed by the curve's own points.
# Second polygon: the triangle beneath the diagonal.
polygon(x_values, y_values, col = roc_fill, border = NA)
polygon(c(0, 1, 1), c(0, 0, 1), col = roc_fill, border = NA)

text(0.6, 0.4, paste("AUC =", round(auc_val, 4)))
Show/Hide Code
# Snapshot the tuned-model ROC figure currently on the graphics device so it
# can be replayed later.
dc.kfoldCV_mod.ROC.plot <- recordPlot()

# Render the cross-validation results (one row per candidate cp value).
pander::pander(dc_model[["results"]])
cp Accuracy Kappa AccuracySD KappaSD
0.001 0.7808 0.318 0.02771 0.07905
0.006 0.7995 0.2971 0.01856 0.07965
0.011 0.801 0.2697 0.01846 0.09344
0.016 0.7937 0.2546 0.01505 0.08292
0.021 0.7988 0.2552 0.01681 0.07696
0.026 0.8006 0.2917 0.01855 0.089
0.031 0.7909 0.1761 0.01035 0.1383
0.036 0.7833 0.008063 0.003631 0.0255
0.041 0.7833 0.008063 0.003631 0.0255
0.046 0.7833 0.008063 0.003631 0.0255
0.051 0.7844 0 0.001053 0
0.056 0.7844 0 0.001053 0
0.061 0.7844 0 0.001053 0
0.066 0.7844 0 0.001053 0
0.071 0.7844 0 0.001053 0
0.076 0.7844 0 0.001053 0
0.081 0.7844 0 0.001053 0
0.086 0.7844 0 0.001053 0
0.091 0.7844 0 0.001053 0
0.096 0.7844 0 0.001053 0

Summary

Show/Hide Code
# Replay the two recorded ROC figures side by side (baseline left, tuned
# right).
cowplot::plot_grid(
  plotlist = list(dc.kfoldCV.ROC.plot, dc.kfoldCV_mod.ROC.plot),
  ncol = 2,
  align = "hv",
  scale = 0.8
)

Model Error Rate Sensitivity Specificity AUC
CART 0.1827 0.9062 0.4644 0.6853261
CART (Tuned) 0.1810 0.9115 0.4519 0.6816843