# Load the spambase dataset and show the class balance.
# NOTE(review): 0/1 presumably encode ham/spam -- confirm against the data's
# codebook.
spambase <- read.csv("../data/spambase.csv", header = TRUE, sep = ",")
table(spambase$class)
0 1
2788 1813
Two types of classifiers are compared: naive Bayes and k-nearest neighbours
Evaluation metric: accuracy
# Fix the RNG seed so the cross-validation fold assignment below is
# reproducible.
set.seed(123)
# caret supplies trainControl() and train() used for both classifiers.
library(caret)
Loading required package: ggplot2
Loading required package: lattice
# 10-fold cross-validation; keep the held-out predictions so per-fold
# accuracy can be computed afterwards.
trctrl <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

# First classifier: naive Bayes on all predictors, with `class` coerced to a
# factor so caret treats this as classification.
# NOTE(review): tuneLength = 0 is unusual for train() -- confirm the intended
# tuning behavior.
nb_fit <- train(
  factor(class) ~ .,
  data = spambase,
  method = "naive_bayes",
  trControl = trctrl,
  tuneLength = 0
)
print(nb_fit)
Naive Bayes
4601 samples
57 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 4141, 4140, 4141, 4141, 4140, 4141, ...
Resampling results across tuning parameters:
usekernel Accuracy Kappa
FALSE 0.7150577 0.4623951
TRUE 0.5833487 0.2622482
Tuning parameter 'laplace' was held constant at a value of 0
Tuning parameter 'adjust' was
held constant at a value of 1
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were laplace = 0, usekernel = FALSE and adjust = 1.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages -------------------------------------------------------- tidyverse 1.3.1 --
v tibble 3.1.6 v dplyr 1.0.7
v tidyr 1.1.4 v stringr 1.4.0
v readr 2.1.1 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x purrr::lift() masks caret::lift()
# Per-fold accuracy for the naive Bayes model: mark each held-out prediction
# as correct (1) or incorrect (0), then average within each CV fold.
pred <- nb_fit$pred
# as.integer() on the logical comparison replaces the redundant
# ifelse(cond, 1, 0) idiom.
pred$equal <- as.integer(pred$pred == pred$obs)

# Plain summarise() replaces the superseded summarise_at()/vars()
# combination; the result is one row per fold with an `Accuracy` column.
eachfold <- pred %>%
  group_by(Resample) %>%
  summarise(Accuracy = mean(equal))
eachfold
# Plot the per-fold accuracies for the naive Bayes model.
# NOTE(review): group = 1 pools all rows into a single boxplot group --
# confirm the intended plot is one box with the fold points overlaid.
ggplot(data = eachfold, aes(x = Resample, y = Accuracy, group = 1)) +
  geom_boxplot(color = "maroon") +
  geom_point() +
  theme_minimal()
library(caret)
# Rebuild the same 10-fold CV control, then fit the second classifier:
# k-nearest neighbours on the same formula and data.
# NOTE(review): the kNN model is stored back into `nb_fit`; the name is
# misleading, but the later per-fold code reads this variable, so it is kept.
trctrl <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
nb_fit <- train(
  factor(class) ~ .,
  data = spambase,
  method = "knn",
  trControl = trctrl,
  tuneLength = 0
)
print(nb_fit)
k-Nearest Neighbors
4601 samples
57 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 4141, 4141, 4142, 4141, 4140, 4141, ...
Resampling results:
Accuracy Kappa
0.8033096 0.586366
Tuning parameter 'k' was held constant at a value of 5
library(tidyverse)
# Per-fold accuracy for the kNN model (stored in `nb_fit` after the second
# train() call): mark each held-out prediction as correct (1) or not (0).
pred <- nb_fit$pred
# as.integer() on the logical comparison replaces the redundant
# ifelse(cond, 1, 0) idiom.
pred$equal <- as.integer(pred$pred == pred$obs)

# Plain summarise() replaces the superseded summarise_at()/vars()
# combination; the result is one row per fold with an `Accuracy` column.
eachfold2 <- pred %>%
  group_by(Resample) %>%
  summarise(Accuracy = mean(equal))
eachfold2
# Plot the per-fold accuracies for the kNN model.
# NOTE(review): group = 1 pools all rows into a single boxplot group --
# confirm the intended plot is one box with the fold points overlaid.
ggplot(data = eachfold2, aes(x = Resample, y = Accuracy, group = 1)) +
  geom_boxplot(color = "maroon") +
  geom_point() +
  theme_minimal()
library(nortest)
library(EnvStats)
Attaching package: ‘EnvStats’
The following objects are masked from ‘package:stats’:
predict, predict.lm
The following object is masked from ‘package:base’:
print.default
# Lilliefors (Kolmogorov-Smirnov) normality test on the naive Bayes
# per-fold accuracies; a high p-value is consistent with normality.
lillie.test(eachfold$Accuracy)
Lilliefors (Kolmogorov-Smirnov) normality test
data: eachfold$Accuracy
D = 0.14472, p-value = 0.7999
# Histogram of the 10 per-fold accuracies for the naive Bayes model;
# descriptive labels replace the placeholder "Main"/"value".
hist(
  eachfold$Accuracy,
  main = "Per-fold CV accuracy (naive Bayes)",
  xlab = "Accuracy",
  border = "light blue",
  col = "blue",
  las = 1
)
# Normal Q-Q plot (EnvStats::qqPlot) against a normal distribution
# parameterised by the sample mean and standard deviation.
qqPlot(
  eachfold$Accuracy,
  y = NULL,
  distribution = "norm",
  param.list = list(mean = mean(eachfold$Accuracy), sd = sd(eachfold$Accuracy))
)
library(nortest)
library(EnvStats)
# Lilliefors (Kolmogorov-Smirnov) normality test on the kNN per-fold
# accuracies; a high p-value is consistent with normality.
lillie.test(eachfold2$Accuracy)
Lilliefors (Kolmogorov-Smirnov) normality test
data: eachfold2$Accuracy
D = 0.14557, p-value = 0.793
# Histogram of the 10 per-fold accuracies for the kNN model; descriptive
# labels replace the placeholder "Main"/"value".
hist(
  eachfold2$Accuracy,
  main = "Per-fold CV accuracy (kNN)",
  xlab = "Accuracy",
  border = "light blue",
  col = "blue",
  las = 1
)
# Normal Q-Q plot (EnvStats::qqPlot) against a normal distribution
# parameterised by the sample mean and standard deviation.
qqPlot(
  eachfold2$Accuracy,
  y = NULL,
  distribution = "norm",
  param.list = list(mean = mean(eachfold2$Accuracy), sd = sd(eachfold2$Accuracy))
)
Paired or non-paired?
Risk/Threshold: p < 0.05
Non-paired
library(stats)
# Compare the two classifiers' per-fold accuracy distributions with an
# unpaired, two-sided Wilcoxon rank-sum test (normal approximation with
# continuity correction; exact p-value computation disabled).
wilcox.test(
  eachfold$Accuracy,
  eachfold2$Accuracy,
  alternative = "two.sided",
  paired = FALSE,
  exact = FALSE,
  correct = TRUE
)
Wilcoxon rank sum test with continuity correction
data: eachfold$Accuracy and eachfold2$Accuracy
W = 0, p-value = 0.0001827
alternative hypothesis: true location shift is not equal to 0