#---------------------------#
#----Model Construction-----#
#---------------------------#

library(caret)

set.seed(1234)

# Define the training control object for 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)

# Train the logistic regression model using 10-fold cross-validation
set.seed(1234)
logit_model <- train(good ~ ., data = train, method = "glm", family = "binomial",
                     trControl = train_control)

save(logit_model, file = "dataset\\model\\logit.model_kfoldCV.Rdata")
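caret keeps the held-out-fold metrics on the returned train object, so the cross-validated performance can be inspected directly. A minimal sketch, assuming logit_model from the chunk above:

# Cross-validated Accuracy and Kappa, averaged over the 10 folds
print(logit_model)

# Per-fold Accuracy and Kappa (one row per held-out fold)
logit_model$resample

# Fold-to-fold variability of the CV accuracy estimate
summary(logit_model$resample$Accuracy)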
# Data Import
load("dataset\\wine.data_cleaned.Rdata")
load("dataset\\train.Rdata")
load("dataset\\test.Rdata")

# Function Import
load("dataset\\function\\accu.kappa.plot.Rdata")

# Model Import
load("dataset\\model\\logit.model_kfoldCV.Rdata")

logit.predictions <- predict(logit_model, newdata = test)
confusionMatrix(logit.predictions, test$good)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 886 163
         1  63  76

               Accuracy : 0.8098
                 95% CI : (0.7863, 0.8317)
    No Information Rate : 0.7988
    P-Value [Acc > NIR] : 0.1832

                  Kappa : 0.2983

 Mcnemar's Test P-Value : 4.537e-11

            Sensitivity : 0.9336
            Specificity : 0.3180
         Pos Pred Value : 0.8446
         Neg Pred Value : 0.5468
             Prevalence : 0.7988
         Detection Rate : 0.7458
   Detection Prevalence : 0.8830
      Balanced Accuracy : 0.6258

       'Positive' Class : 0
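predict() on a train object returns hard class labels at the default assignment; applying a custom cutoff such as the 0.7 used below requires the class probabilities instead. A minimal sketch of extracting them from the caret-fitted model (variable names logit.probs and p_good are illustrative, and the column names assume the factor levels are "0" and "1" as in the confusion matrices):

# Class probabilities from the caret-fitted model;
# predict.train returns one column per class level
logit.probs <- predict(logit_model, newdata = test, type = "prob")
head(logit.probs)

# Probability of the positive-quality class (level "1")
p_good <- logit.probs[, "1"]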
# Make predictions on test data with the stepAIC-selected glm fitted
# earlier (glm.fit), then construct a confusion matrix at a 0.7 cutoff
logit.predictions <- predict(glm.fit, newdata = test, type = "response")
logit.predictions <- factor(ifelse(logit.predictions > 0.7, 1, 0),
                            levels = c(0, 1))
confusionMatrix(logit.predictions, test$good)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 939 218
         1  10  21

               Accuracy : 0.8081
                 95% CI : (0.7845, 0.8301)
    No Information Rate : 0.7988
    P-Value [Acc > NIR] : 0.2246

                  Kappa : 0.1147

 Mcnemar's Test P-Value : <2e-16

            Sensitivity : 0.98946
            Specificity : 0.08787
         Pos Pred Value : 0.81158
         Neg Pred Value : 0.67742
             Prevalence : 0.79882
         Detection Rate : 0.79040
   Detection Prevalence : 0.97391
      Balanced Accuracy : 0.53866

       'Positive' Class : 0
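The ROC code below uses a ROCR prediction object pred_obj and an AUC value auc_val that are created elsewhere. A minimal sketch of how they would be built from the predicted probabilities (the variable names match the plotting code; the construction itself, including using glm.fit as the probability source, is an assumption):

library(ROCR)

# Predicted probabilities from the stepAIC-selected model
log.pred <- predict(glm.fit, newdata = test, type = "response")

# ROCR prediction object pairing scores with the true labels
pred_obj <- prediction(log.pred, test$good)

# Area under the ROC curve
auc_val <- performance(pred_obj, "auc")@y.values[[1]]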
log.perf <- performance(pred_obj, "tpr", "fpr")

plot(log.perf, colorize = TRUE, lwd = 2,
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     main = "caret::glm ROC Curves with stepAIC")
abline(a = 0, b = 1)

x_values <- as.numeric(unlist(log.perf@x.values))
y_values <- as.numeric(unlist(log.perf@y.values))
polygon(x = x_values, y = y_values,
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3), border = NA)
polygon(x = c(0, 1, 1), y = c(0, 0, 1),
        col = rgb(0.3803922, 0.6862745, 0.9372549, alpha = 0.3), border = NA)
text(0.6, 0.4, paste("AUC =", round(auc_val, 4)))
logit.kfoldCV_caret_tuned.ROC.plot <- recordPlot()
K-fold CV (MASS)
# Set the number of folds
k <- 10

# Randomly assign each row in the data to a fold
set.seed(1234)  # for reproducibility
fold_indices <- sample(rep(1:k, length.out = nrow(wine.data_cleaned)))

# Initialize an empty list to store the folds
folds <- vector("list", k)

# Assign each row to a fold
for (i in 1:k) {
  folds[[i]] <- which(fold_indices == i)
}

# To store the error rate and Kappa of each fold
error_rate <- numeric(k)
kappa <- numeric(k)
confusion_matrices <- vector("list", k)

# Loop through each fold
for (i in 1:k) {
  # Extract the i-th fold as the testing set
  test_indices <- unlist(folds[[i]])
  test <- wine.data_cleaned[test_indices, ]
  train <- wine.data_cleaned[-test_indices, ]

  # Fit the model on the training set
  logit_model <- glm(good ~ ., data = train, family = binomial)

  # Make predictions on the testing set at the 0.7 cutoff
  log.pred <- predict(logit_model, newdata = test, type = "response")
  test$good <- factor(test$good, levels = c(0, 1))
  predicted_classes <- factor(ifelse(log.pred > 0.7, 1, 0), levels = c(0, 1))

  # Compute the overall error rate (misclassification rate)
  error_rate[i] <- mean(predicted_classes != test$good)

  # Compute the confusion matrix and extract the Kappa value
  confusion_matrices[[i]] <- caret::confusionMatrix(predicted_classes, test$good)
  kappa[i] <- confusion_matrices[[i]]$overall[["Kappa"]]

  # Print the error rate for each fold
  cat(paste0("Fold ", i, ": OER: ", error_rate[i], "\n"))
}
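A short summary across folds makes the cross-validated estimate and its spread explicit. A minimal sketch, assuming error_rate and kappa from the loop above:

# Average cross-validated error rate and Kappa, with fold-to-fold spread
cat("Mean OER: ", round(mean(error_rate), 4),
    " (SD: ", round(sd(error_rate), 4), ")\n", sep = "")
cat("Mean Kappa: ", round(mean(kappa), 4),
    " (SD: ", round(sd(kappa), 4), ")\n", sep = "")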
# Set the seed for reproducibility
set.seed(1234)

# Proportion of data to use for training
train_prop <- 0.7

# Split the data into training and testing sets
train_indices <- sample(seq_len(nrow(wine.data_cleaned)),
                        size = round(train_prop * nrow(wine.data_cleaned)),
                        replace = FALSE)
train <- wine.data_cleaned[train_indices, ]
test <- wine.data_cleaned[-train_indices, ]

# Fit the model on the training set
logit_model <- glm(good ~ ., data = train, family = binomial)

# Make predictions on the testing set at the 0.7 cutoff
log.pred <- predict(logit_model, newdata = test, type = "response")
test$good <- as.factor(test$good)
predicted_classes <- factor(ifelse(log.pred > 0.7, 1, 0), levels = c(0, 1))

# Compute the overall error rate
error_rate <- mean(predicted_classes != test$good)

# Confusion matrix for the predictions on the testing set
confusionMatrix(predicted_classes, test$good)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 938 214
         1  11  25

               Accuracy : 0.8106
                 95% CI : (0.7871, 0.8325)
    No Information Rate : 0.7988
    P-Value [Acc > NIR] : 0.1643

                  Kappa : 0.1363

 Mcnemar's Test P-Value : <2e-16

            Sensitivity : 0.9884
            Specificity : 0.1046
         Pos Pred Value : 0.8142
         Neg Pred Value : 0.6944
             Prevalence : 0.7988
         Detection Rate : 0.7896
   Detection Prevalence : 0.9697
      Balanced Accuracy : 0.5465

       'Positive' Class : 0
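The 0.7 cutoff trades away most of the specificity (compare Kappa 0.2983 under the default class assignment with 0.1363 here), so it is worth checking how the metrics move with the threshold. A minimal sketch, assuming log.pred and the factor test$good from the block above are still in scope (the sweep itself is illustrative, not part of the original analysis):

# Evaluate several cutoffs on the held-out set
cutoffs <- seq(0.3, 0.8, by = 0.1)
sweep <- sapply(cutoffs, function(cut) {
  pred <- factor(ifelse(log.pred > cut, 1, 0), levels = c(0, 1))
  cm <- caret::confusionMatrix(pred, test$good)
  c(Accuracy = unname(cm$overall["Accuracy"]),
    Kappa = unname(cm$overall["Kappa"]))
})
colnames(sweep) <- paste0("cutoff=", cutoffs)
round(sweep, 4)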