Decision Tree

CART (Classification and Regression Trees) is a popular decision tree algorithm used for both classification and regression problems. It constructs a binary tree in which each internal node represents a test on a single feature and each leaf node represents a prediction: a class label for classification (as in our task) or a numeric value for regression. The splitting of nodes in the tree is based on a measure of impurity such as Gini impurity or entropy. The CART algorithm is often used in applications such as finance, marketing, and healthcare.

Model Construction

Show/Hide Code

# ---- Decision Tree: baseline CART fit (caret method "rpart2") ----
# Resampling scheme: 10-fold cross-validation.
set.seed(1234)
train_control <- trainControl(
  number = 10,
  method = "cv"
)

# Re-seed right before fitting so the resampling folds are reproducible.
set.seed(1234)
dc_model <- train(
  good ~ .,
  data      = train,
  method    = "rpart2",
  na.action = na.omit,
  trControl = train_control
)

# Persist the fitted object; it is stored under the name `dc_model`.
save(dc_model, file = "dataset\\model\\dc.model_kfoldCV.Rdata")


#----------------------------#
#----Decision Tree (Mod)-----#
#----------------------------#
# CART fitted with caret method "rpart", tuned over the complexity
# parameter `cp` using 10-fold cross-validation.
set.seed(1234)
train_control <- trainControl(method = "cv", number = 10)

# Re-seed right before fitting so the resampling folds are reproducible.
set.seed(1234)
dc_model <- train(good ~ ., 
                  data = train, 
                  method = "rpart", 
                  trControl = train_control,
                  # Explicit grid of candidate cp values: 0.001, 0.006, ..., 0.096.
                  # `tuneLength` was dropped: caret only uses it to build a
                  # default grid and ignores it once `tuneGrid` is supplied.
                  tuneGrid = data.frame(cp = seq(0.001, 0.1, by = 0.005)))

# Persist the tuned fit; it is stored under the name `dc_model`.
save(dc_model, file = "dataset\\model\\dc.model_kfoldCV_mod.Rdata")

K-fold CV

Show/Hide Code

# Restore the saved objects, in order: data sets, the plotting helper,
# and the baseline CART model.
rdata_files <- c(
  "dataset\\wine.data_cleaned.Rdata",            # Data Import
  "dataset\\train.Rdata",
  "dataset\\test.Rdata",
  "dataset\\function\\accu.kappa.plot.Rdata",    # Function Import
  "dataset\\model\\dc.model_kfoldCV.Rdata"       # Model import
)
for (rdata_file in rdata_files) {
  load(rdata_file)
}

# Class predictions on the hold-out set, compared against the true labels.
dc.predictions <- predict(dc_model, newdata = test)

confusionMatrix(data = dc.predictions, reference = test$good)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 860 128
         1  89 111
                                          
               Accuracy : 0.8173          
                 95% CI : (0.7942, 0.8389)
    No Information Rate : 0.7988          
    P-Value [Acc > NIR] : 0.058574        
                                          
                  Kappa : 0.3947          
                                          
 Mcnemar's Test P-Value : 0.009891        
                                          
            Sensitivity : 0.9062          
            Specificity : 0.4644          
         Pos Pred Value : 0.8704          
         Neg Pred Value : 0.5550          
             Prevalence : 0.7988          
         Detection Rate : 0.7239          
   Detection Prevalence : 0.8316          
      Balanced Accuracy : 0.6853          
                                          
       'Positive' Class : 0

Show/Hide Code

# ROCR needs numeric scores, so the predictions are coerced to numeric.
# NOTE(review): these look like hard class labels rather than class
# probabilities, which would give the ROC a single operating point -
# confirm whether predict(..., type = "prob") was intended.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(predictions = dc.predictions, labels = test$good)

# Area under the ROC curve, extracted from the performance object.
auc_val <- performance(pred_obj, measure = "auc")@y.values[[1]]
auc_val

[1] 0.6853261

Show/Hide Code

# ROC curve (true positive rate against false positive rate).
roc_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")
plot(
  roc_obj,
  main = "CART (10-fold CV)",
  xlab = "False Positive Rate",
  ylab = "True Positive Rate",
  colorize = TRUE,
  lwd = 2
)
abline(a = 0, b = 1)  # reference diagonal

# Curve coordinates as plain numeric vectors.
x_values <- as.numeric(unlist(roc_obj@x.values))
y_values <- as.numeric(unlist(roc_obj@y.values))

# Both shaded regions share the same translucent blue.
shade_col <- rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3)
polygon(x = x_values, y = y_values, col = shade_col, border = NA)
polygon(x = c(0, 1, 1), y = c(0, 0, 1), col = shade_col, border = NA)

# Annotate the figure with the AUC rounded to four decimals.
text(x = 0.6, y = 0.4, labels = paste("AUC =", round(auc_val, 4)))

Show/Hide Code

# Snapshot the figure currently on the graphics device (the ROC plot).
dc.kfoldCV.ROC.plot <- recordPlot()

# Cross-validated accuracy and kappa per tuning value; the `k` column
# carries the tree's maxdepth.
dc_df <- data.frame(k = dc_model$results$maxdepth,
                    Accuracy = dc_model$results$Accuracy,
                    Kappa = dc_model$results$Kappa)

# Accuracy/kappa plot with value labels. The trailing `+` after labs()
# is required: without it ggtitle() is evaluated as a separate statement,
# the title never reaches the plot, and a stray "labels" object is printed.
dc.kfoldCV.plot <- accu.kappa.plot(dc_df) + 
  geom_text(aes(x = k, y = Accuracy, label = round(Accuracy, 3)), hjust = -0.3, angle = 90) +
  geom_text(aes(x = k, y = Kappa, label = round(Kappa, 3)), hjust = -0.3, angle = 90) +
  labs(x = "Max Depth") +
  ggtitle("CART Model Performance")

$title
[1] "CART Model Performance"

attr(,"class")
[1] "labels"

Show/Hide Code

# Render the resampling results of the fitted model as a Pandoc table.
pander::pander(dc_model[["results"]])

maxdepth	Accuracy	Kappa	AccuracySD	KappaSD
3	0.7967	0.2956	0.02037	0.0916
5	0.7992	0.247	0.01744	0.07917
9	0.797	0.2773	0.01861	0.0978

Tuned

Show/Hide Code

# Load the cp-tuned CART fit from disk.
# NOTE(review): the file is presumably the one saved as `dc_model`, in which
# case this overwrites the baseline model object - confirm.
load(file = "dataset\\model\\dc.model_kfoldCV_mod.Rdata")

# Class predictions on the hold-out set, compared against the true labels.
dc.predictions <- predict(object = dc_model, newdata = test)

confusionMatrix(data = dc.predictions, reference = test$good)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 865 131
         1  84 108
                                          
               Accuracy : 0.819           
                 95% CI : (0.7959, 0.8405)
    No Information Rate : 0.7988          
    P-Value [Acc > NIR] : 0.043152        
                                          
                  Kappa : 0.3922          
                                          
 Mcnemar's Test P-Value : 0.001706        
                                          
            Sensitivity : 0.9115          
            Specificity : 0.4519          
         Pos Pred Value : 0.8685          
         Neg Pred Value : 0.5625          
             Prevalence : 0.7988          
         Detection Rate : 0.7281          
   Detection Prevalence : 0.8384          
      Balanced Accuracy : 0.6817          
                                          
       'Positive' Class : 0

Show/Hide Code

# ROCR needs numeric scores, so the predictions are coerced to numeric.
# NOTE(review): these look like hard class labels rather than class
# probabilities, which would give the ROC a single operating point -
# confirm whether predict(..., type = "prob") was intended.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(predictions = dc.predictions, labels = test$good)

# Area under the ROC curve, extracted from the performance object.
auc_val <- performance(pred_obj, measure = "auc")@y.values[[1]]
auc_val

[1] 0.6816843

Show/Hide Code

# ROC curve (true positive rate against false positive rate), tuned model.
roc_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")
plot(
  roc_obj,
  main = "CART Tuned (10-fold CV)",
  xlab = "False Positive Rate",
  ylab = "True Positive Rate",
  colorize = TRUE,
  lwd = 2
)
abline(a = 0, b = 1)  # reference diagonal

# Curve coordinates as plain numeric vectors.
x_values <- as.numeric(unlist(roc_obj@x.values))
y_values <- as.numeric(unlist(roc_obj@y.values))

# Both shaded regions share the same translucent blue.
shade_col <- rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3)
polygon(x = x_values, y = y_values, col = shade_col, border = NA)
polygon(x = c(0, 1, 1), y = c(0, 0, 1), col = shade_col, border = NA)

# Annotate the figure with the AUC rounded to four decimals.
text(x = 0.6, y = 0.4, labels = paste("AUC =", round(auc_val, 4)))

Show/Hide Code

# Snapshot the figure currently on the graphics device (the tuned ROC plot).
dc.kfoldCV_mod.ROC.plot <- recordPlot()

# Render the resampling results of the fitted model as a Pandoc table.
pander::pander(dc_model[["results"]])

cp	Accuracy	Kappa	AccuracySD	KappaSD
0.001	0.7808	0.318	0.02771	0.07905
0.006	0.7995	0.2971	0.01856	0.07965
0.011	0.801	0.2697	0.01846	0.09344
0.016	0.7937	0.2546	0.01505	0.08292
0.021	0.7988	0.2552	0.01681	0.07696
0.026	0.8006	0.2917	0.01855	0.089
0.031	0.7909	0.1761	0.01035	0.1383
0.036	0.7833	0.008063	0.003631	0.0255
0.041	0.7833	0.008063	0.003631	0.0255
0.046	0.7833	0.008063	0.003631	0.0255
0.051	0.7844	0	0.001053	0
0.056	0.7844	0	0.001053	0
0.061	0.7844	0	0.001053	0
0.066	0.7844	0	0.001053	0
0.071	0.7844	0	0.001053	0
0.076	0.7844	0	0.001053	0
0.081	0.7844	0	0.001053	0
0.086	0.7844	0	0.001053	0
0.091	0.7844	0	0.001053	0
0.096	0.7844	0	0.001053	0

Summary

Show/Hide Code

# Baseline and tuned ROC plots side by side.
cowplot::plot_grid(
  plotlist = list(dc.kfoldCV.ROC.plot, dc.kfoldCV_mod.ROC.plot),
  ncol = 2,
  align = "hv",
  scale = 0.8
)

Model	Error Rate	Sensitivity	Specificity	AUC
CART	0.1827	0.9062	0.4644	0.6853261
CART (Tuned)	0.1810	0.9115	0.4519	0.6816843

---
title: "Decision Tree"
---

```{r setup, include=FALSE}
# NOTE(review): knitr documents `fig.align` as a character option
# ("default", "left", "right", "center"); TRUE is kept as found -
# confirm the intended alignment.
knitr::opts_chunk$set(fig.align = TRUE)

library(tidyverse)
# Load core packages:
# ggplot2, for data visualization.
# dplyr, for data manipulation.
# tidyr, for data tidying.
# purrr, for functional programming.
# tibble, for tibbles, a modern re-imagining of data frames.
# stringr, for strings.
# forcats, for factors.
# lubridate, for date/times.
# readr, for reading .csv, .tsv, and .fwf files.
# readxl, for reading .xls, and .xlxs files.
# feather, for sharing with Python and other languages.
# haven, for SPSS, SAS and Stata files.
# httr, for web apis.
# jsonlite for JSON.
# rvest, for web scraping.
# xml2, for XML.
# modelr, for modelling within a pipeline
# broom, for turning models into tidy data
# hms, for times.

library(magrittr)   # Pipeline operator
library(lobstr)     # Visualizing abstract syntax trees, stack trees, and object sizes
library(pander)     # Exporting/converting complex pandoc documents, EX: df to Pandoc table
library(ggforce)    # More plot functions on top of ggplot2
library(ggpubr)     # Automatically add p-values and significance levels plots.
                    # Arrange and annotate multiple plots on the same page.
                    # Change graphical parameters such as colors and labels.
library(sf)         # Geo-spatial vector manipulation: points, lines, polygons
library(kableExtra) # Generate 90 % of complex/advanced/self-customized/beautiful tables
library(cowplot)    # Multiple plots arrangement
library(gridExtra)  # Multiple plots arrangement
library(animation)  # Animated figure container
library(latex2exp)  # Latex axis titles in ggplot2
library(ellipse)    # Simultaneous confidence interval region to check C.I. of 2 slope parameters / 2 regressors
library(plotly)     # User interactive plots
library(olsrr)      # Model selections
library(leaps)      # Regression subsetting
library(pls)        # Partial Least squares
library(MASS)       # LDA, QDA, OLS, Ridge Regression, Box-Cox, stepAIC, etc,.
library(e1071)      # Naive Bayesian Classifier, SVM, GKNN, ICA, LCA
library(class)      # KNN, SOM, LVQ
library(ROCR)       # Precision/Recall/Sensitivity/Specificity performance plot
library(boot)       # LOOCV, Bootstrap,
library(caret)      # Classification/Regression Training, run ?caret::trainControl
library(corrgram)   # for correlation matrix
library(corrplot)   # for graphical display of correlation matrix

set.seed(1234)      # make random results reproducible

current_dir <- getwd()

if (!is.null(current_dir)) {
  setwd(current_dir)
  remove(current_dir)
}
```

CART (Classification and Regression Trees) is a popular decision tree algorithm used for both classification and regression problems. It constructs a binary tree in which each internal node represents a test on a single feature and each leaf node represents a prediction: a class label for classification (as in our task) or a numeric value for regression. The splitting of nodes in the tree is based on a measure of impurity such as Gini impurity or entropy. The CART algorithm is often used in applications such as finance, marketing, and healthcare.

## Model Construction

```{r nb.model_savings, eval=FALSE}
#----------------------#
#----Decision Tree-----#
#----------------------#
set.seed(1234)
train_control <- trainControl(method = "cv", number = 10)

set.seed(1234)
dc_model <- train(good ~ .,
                  data = train,
                  method = "rpart2",
                  trControl = train_control,
                  na.action = na.omit)

save(dc_model, file = "dataset\\model\\dc.model_kfoldCV.Rdata")


#----------------------------#
#----Decision Tree (Mod)-----#
#----------------------------#
set.seed(1234)
train_control <- trainControl(method = "cv", number = 10)

# `tuneLength` is omitted on purpose: caret ignores it once an explicit
# `tuneGrid` is supplied.
set.seed(1234)
dc_model <- train(good ~ .,
                  data = train,
                  method = "rpart",
                  trControl = train_control,
                  tuneGrid = data.frame(cp = seq(0.001, 0.1, by = 0.005)))

save(dc_model, file = "dataset\\model\\dc.model_kfoldCV_mod.Rdata")
```

## K-fold CV

```{r dc.kfoldCV, fig.show='hide'}
# Data Import
load("dataset\\wine.data_cleaned.Rdata")
load("dataset\\train.Rdata")
load("dataset\\test.Rdata")

# Function Import
load("dataset\\function\\accu.kappa.plot.Rdata")

# Model import
load("dataset\\model\\dc.model_kfoldCV.Rdata")

dc.predictions <- predict(dc_model, newdata = test)

confusionMatrix(dc.predictions, test$good)

# NOTE(review): the AUC/ROC below are built from hard class labels, not
# class probabilities - confirm this is intended.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(dc.predictions, test$good)
auc_val <- performance(pred_obj, "auc")@y.values[[1]]
auc_val

roc_obj <- performance(pred_obj, "tpr", "fpr")
plot(roc_obj, colorize = TRUE, lwd = 2,
     xlab = "False Positive Rate",
     ylab = "True Positive Rate",
     main = "CART (10-fold CV)")
abline(a = 0, b = 1)
x_values <- as.numeric(unlist(roc_obj@x.values))
y_values <- as.numeric(unlist(roc_obj@y.values))
polygon(x = x_values, y = y_values,
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3),
        border = NA)
polygon(x = c(0, 1, 1), y = c(0, 0, 1),
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3),
        border = NA)
text(0.6, 0.4, paste("AUC =", round(auc_val, 4)))

dc.kfoldCV.ROC.plot <- recordPlot()

dc_df <- data.frame(k = dc_model$results$maxdepth,
                    Accuracy = dc_model$results$Accuracy,
                    Kappa = dc_model$results$Kappa)

# The `+` after labs() is required so that ggtitle() is added to the plot
# instead of being evaluated (and printed) as a separate statement.
dc.kfoldCV.plot <- accu.kappa.plot(dc_df) +
  geom_text(aes(x = k, y = Accuracy, label = round(Accuracy, 3)), hjust = -0.3, angle = 90) +
  geom_text(aes(x = k, y = Kappa, label = round(Kappa, 3)), hjust = -0.3, angle = 90) +
  labs(x = "Max Depth") +
  ggtitle("CART Model Performance")

pander::pander(dc_model$results)
```

### Tuned

```{r, dc.kfoldCV_mod, fig.show='hide'}
# Model Import
load("dataset\\model\\dc.model_kfoldCV_mod.Rdata")

dc.predictions <- predict(dc_model, newdata = test)

confusionMatrix(dc.predictions, test$good)

# NOTE(review): the AUC/ROC below are built from hard class labels, not
# class probabilities - confirm this is intended.
dc.predictions <- as.numeric(dc.predictions)
pred_obj <- prediction(dc.predictions, test$good)
auc_val <- performance(pred_obj, "auc")@y.values[[1]]
auc_val

roc_obj <- performance(pred_obj, "tpr", "fpr")
plot(roc_obj, colorize = TRUE, lwd = 2,
     xlab = "False Positive Rate",
     ylab = "True Positive Rate",
     main = "CART Tuned (10-fold CV)")
abline(a = 0, b = 1)
x_values <- as.numeric(unlist(roc_obj@x.values))
y_values <- as.numeric(unlist(roc_obj@y.values))
polygon(x = x_values, y = y_values,
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3),
        border = NA)
polygon(x = c(0, 1, 1), y = c(0, 0, 1),
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3),
        border = NA)
text(0.6, 0.4, paste("AUC =", round(auc_val, 4)))

dc.kfoldCV_mod.ROC.plot <- recordPlot()

pander::pander(dc_model$results)
```

## Summary

```{r fig.width=20, fig.height=10}
cowplot::plot_grid(dc.kfoldCV.ROC.plot, dc.kfoldCV_mod.ROC.plot,
                   ncol = 2, align = "hv", scale = 0.8)
```

| Model                | Error Rate | Sensitivity | Specificity | AUC       |
| -------------------- | ---------- | ----------- | ----------- | --------- |
| CART                 | 0.1827     | 0.9062      | 0.4644      | 0.6853261 |
| CART (Tuned)         | 0.1810     | 0.9115      | 0.4519      | 0.6816843 |

```{r, echo=FALSE}
save(dc.kfoldCV.ROC.plot, file = "dataset\\plot\\dc.kfoldCV.ROC.plot.Rdata")
save(dc.kfoldCV_mod.ROC.plot, file = "dataset\\plot\\dc.kfoldCV_mod.ROC.plot.Rdata")
```