Exploratory Analysis

Data Description

This is a dataset of wine quality containing 4898 observations of 12 variables. The variables are:

fixed.acidity: The amount of fixed acid in the wine (\(g/dm^3\))
volatile.acidity: The amount of volatile acid in the wine (\(g/dm^4\))
citric.acid: The amount of citric acid in the wine (\(g/dm^3\))
residual.sugar: The amount of residual sugar in the wine (\(g/dm^3\))
chlorides: The amount of salt in the wine (\(g/dm^3\))
free.sulfur.dioxide: The amount of free sulfur dioxide in the wine (\(mg/dm^3\))
total.sulfur.dioxide: The amount of total sulfur dioxide in the wine (\(mg/dm^3\))
density: The density of the wine (\(g/dm^3\))
pH: The \(pH\) value of the wine
sulphates: The amount of sulphates in the wine (\(g/dm^3\))
alcohol: The alcohol content of the wine (\(\% vol\))
quality: The quality score of the wine (0-10)

After removing the duplicate rows from our data set, we are left with 3961 observations of the above 11 variables minus quality column variable, and introduced a new variable good as our response:

good: A binary variable indicating whether the wine is good (quality \(\geq\) 7) or not (quality \(<\) 7).

Data Import

Show/Hide Code

# Import original dataset
wine.data <- read.csv("dataset\\winequality-white.csv", sep=";", header = T)

str(wine.data)

'data.frame':   4898 obs. of  12 variables:
 $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
 $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
 $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
 $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
 $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
 $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
 $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
 $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
 $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
 $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
 $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
 $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...

Show/Hide Code

# Removing duplicate Rows, mutate our categorical response good
wine.data_cleaned <-  wine.data %>% mutate(good = ifelse(quality>=7, 1, 0)) %>% distinct() %>% dplyr::select(-quality)

str(wine.data_cleaned)

'data.frame':   3961 obs. of  12 variables:
 $ fixed.acidity       : num  7 6.3 8.1 7.2 6.2 8.1 8.1 8.6 7.9 6.6 ...
 $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.32 0.22 0.27 0.23 0.18 0.16 ...
 $ citric.acid         : num  0.36 0.34 0.4 0.32 0.16 0.43 0.41 0.4 0.37 0.4 ...
 $ residual.sugar      : num  20.7 1.6 6.9 8.5 7 1.5 1.45 4.2 1.2 1.5 ...
 $ chlorides           : num  0.045 0.049 0.05 0.058 0.045 0.044 0.033 0.035 0.04 0.044 ...
 $ free.sulfur.dioxide : num  45 14 30 47 30 28 11 17 16 48 ...
 $ total.sulfur.dioxide: num  170 132 97 186 136 129 63 109 75 143 ...
 $ density             : num  1.001 0.994 0.995 0.996 0.995 ...
 $ pH                  : num  3 3.3 3.26 3.19 3.18 3.22 2.99 3.14 3.18 3.54 ...
 $ sulphates           : num  0.45 0.49 0.44 0.4 0.47 0.45 0.56 0.53 0.63 0.52 ...
 $ alcohol             : num  8.8 9.5 10.1 9.9 9.6 11 12 9.7 10.8 12.4 ...
 $ good                : num  0 0 0 0 0 0 0 0 0 1 ...

Data Analysis

Show/Hide Code

dim(wine.data)

[1] 4898   12

Show/Hide Code

dim(wine.data_cleaned)

[1] 3961   12

Show/Hide Code

summary(wine.data)

 fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
 Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
 1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
 Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
 Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
 3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
 Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
   chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
 Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
 1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
 Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
 Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
 3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
 Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
       pH          sulphates         alcohol         quality     
 Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
 1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
 Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
 Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
 3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
 Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000

Show/Hide Code

summary(wine.data_cleaned)

 fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
 Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
 1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.600  
 Median : 6.800   Median :0.2600   Median :0.3200   Median : 4.700  
 Mean   : 6.839   Mean   :0.2805   Mean   :0.3343   Mean   : 5.915  
 3rd Qu.: 7.300   3rd Qu.:0.3300   3rd Qu.:0.3900   3rd Qu.: 8.900  
 Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
   chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
 Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
 1st Qu.:0.03500   1st Qu.: 23.00      1st Qu.:106.0        1st Qu.:0.9916  
 Median :0.04200   Median : 33.00      Median :133.0        Median :0.9935  
 Mean   :0.04591   Mean   : 34.89      Mean   :137.2        Mean   :0.9938  
 3rd Qu.:0.05000   3rd Qu.: 45.00      3rd Qu.:166.0        3rd Qu.:0.9957  
 Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
       pH          sulphates         alcohol           good       
 Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :0.0000  
 1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:0.0000  
 Median :3.180   Median :0.4800   Median :10.40   Median :0.0000  
 Mean   :3.195   Mean   :0.4904   Mean   :10.59   Mean   :0.2083  
 3rd Qu.:3.290   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:0.0000  
 Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :1.0000

Show/Hide Code

# Check for NAs in dataset
sum(is.na(wine.data))

[1] 0

Show/Hide Code

# Counts for response's at each factor level
table(wine.data$quality)


   3    4    5    6    7    8    9 
  20  163 1457 2198  880  175    5

Data Distribution

Show/Hide Code

wine.colnames <- colnames(wine.data)
num_plots     <- length(wine.colnames)
num_rows      <- ceiling(num_plots/3)

# Create an empty list to store plots
grid_arr      <- list()

# Loop over each column name in the wine.colnames vector
for(i in 1:num_plots) {
  # Create a ggplot object for the current column using aes
  plt <- ggplot(data = wine.data, aes_string(x = wine.colnames[i])) +
    geom_histogram(binwidth = diff(range(wine.data[[wine.colnames[i]]]))/30, 
                   color = "black", fill = "slategray3") +
    labs(x = wine.colnames[i], y = "Frequency") +
    theme_bw()
  
  # Add the current plot to the grid_arr list
  grid_arr[[i]] <- plt
}

grid_arr <- do.call(gridExtra::grid.arrange, c(grid_arr, ncol = 3))

Data Relationships

Show/Hide Code

reshape2::melt(wine.data[, 1:12], "quality") %>% 
  ggplot(aes(value, quality, color = variable)) +  
  geom_point() + 
  geom_smooth(aes(value, quality, colour=variable), method=lm, se=FALSE)+
  facet_wrap(.~variable, scales = "free")

Show/Hide Code

# Collinearity between Attributes
cor(wine.data) %>% 
  corrplot::corrplot(method = 'number',  type = "lower", tl.col = "steelblue", number.cex = 0.5)

Show/Hide Code

reshape2::melt(wine.data_cleaned[, 1:12], "good") %>% 
  ggplot(aes(value, good, color = variable)) +  
  geom_point() + 
  geom_smooth(aes(value, good, colour=variable), method=lm, se=FALSE)+
  facet_wrap(.~variable, scales = "free")

Show/Hide Code

# Collinearity between Attributes
cor(wine.data_cleaned) %>% 
  corrplot::corrplot(method = 'number',  type = "lower", tl.col = "steelblue", number.cex = 0.5)

Data Split

Show/Hide Code

set.seed(1234)
# Splitting the dataset into train and test (7/10th for train remaining for test)
inTrain <- caret::createDataPartition(wine.data_cleaned$good, p = 7/10, list = F)
train <- wine.data_cleaned[inTrain,]
test  <- wine.data_cleaned[-inTrain,]


# Convert the outcome variable to a factor with two levels
train$good <- as.factor(train$good)
test$good <- as.factor(test$good)

# Save data for building models in the next step
save(wine.data, file = "dataset\\wine.data.Rdata")
save(wine.data_cleaned, file = "dataset\\wine.data_cleaned.Rdata")
save(train, file = "dataset\\train.Rdata")
save(test, file = "dataset\\test.Rdata")