This notebook contains examples of several One-Class Classification algorithms for outlier detection. Every algorithm is tested on the Spambase dataset, and each one is also tested on an additional dataset.
# Package with benchmark datasets
library(mlbench)
# Package with IsolationForest implementation
library(solitude)
# Package with SVM implementation
library(e1071)
# Package with LOF implementation
library(DDoutlier)
# Package with autoencoder implementation
library(h2o)
spambase <- read.csv(file = "../data/spambase.csv", header = TRUE, sep = ",")
spambase$class <- as.factor(spambase$class)
# keep only the non-spam samples (one-class setting)
spambaseNONSPAM <- spambase[spambase$class == 0, ]
# drop the class column (column 58)
spambaseNONSPAM <- spambaseNONSPAM[, -58]
head(spambaseNONSPAM)
# create an (untrained) isolation forest object
iso <- isolationForest$new()
# fit the isolation forest to our data
iso$fit(spambaseNONSPAM)
INFO [22:09:01.654] dataset has duplicated rows
INFO [22:09:01.762] Building Isolation Forest ...
INFO [22:09:03.892] done
INFO [22:09:03.924] Computing depth of terminal nodes ...
INFO [22:09:05.164] done
INFO [22:09:05.393] Completed growing isolation forest
p <- iso$predict(spambaseNONSPAM)
print(p)
# sort(p$anomaly_score)
plot(density(p$anomaly_score), main = "Anomaly Score Density")
# Based on the plot, choose a cut-off point (e.g., > 0.63)
which(p$anomaly_score > 0.63)
[1] 152 762 818 884 1105 1162 1433 1460 1551
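# To make the chosen cut-off visible, it can be marked on the density plot
# and the flagged rows collected for inspection (a minimal sketch; 0.63 is
# the eyeballed value from above):
plot(density(p$anomaly_score), main = "Anomaly Score Density")
abline(v = 0.63, col = "red", lty = 2)
suspects <- spambaseNONSPAM[p$anomaly_score > 0.63, ]
nrow(suspects)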
# Housing data for 506 census tracts of Boston
data("BostonHousing", package = "mlbench")
head(BostonHousing)
# create an (untrained) isolation forest object
iso <- isolationForest$new()
# fit the isolation forest to our data
iso$fit(BostonHousing)
INFO [22:09:07.862] Building Isolation Forest ...
INFO [22:09:08.038] done
INFO [22:09:08.067] Computing depth of terminal nodes ...
INFO [22:09:08.706] done
INFO [22:09:08.764] Completed growing isolation forest
p <- iso$predict(BostonHousing)
print(p)
# sort(p$anomaly_score)
plot(density(p$anomaly_score), main = "Anomaly Score Density")
# Based on the plot, choose a cut-off point (e.g., > 0.63)
which(p$anomaly_score > 0.63)
[1] 164 198 205 254 255 256 283 284 287 354 355 356 373 491
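# Both runs above use solitude's default forest settings. For reproducible
# scores the constructor also accepts tuning arguments; the argument names
# below (sample_size, num_trees, seed) follow the solitude documentation and
# may differ across package versions -- a sketch, not a definitive call:
iso <- isolationForest$new(sample_size = 256, num_trees = 100, seed = 101)
iso$fit(BostonHousing)
p <- iso$predict(BostonHousing)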
# train an SVM one-classification model
model <- svm(spambaseNONSPAM, y = NULL, type = "one-classification")
summary(model)
Call:
svm.default(x = spambaseNONSPAM, y = NULL, type = "one-classification")
Parameters:
SVM-Type: one-classification
SVM-Kernel: radial
gamma: 0.01754386
nu: 0.5
Number of Support Vectors: 1395
Number of Classes: 1
# CAUTION: testing on the same training set
# TRUE means the sample belongs to the learned class; FALSE marks suspect outliers
pred <- predict(model, spambaseNONSPAM)
# which(pred == FALSE)
table(pred)
pred
FALSE TRUE
1413 1375
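# With the default nu = 0.5 the one-class SVM may leave up to half of the
# training points outside the boundary, which is why roughly half of the
# predictions come back FALSE. If a smaller outlier fraction is expected,
# nu can be lowered -- a minimal sketch:
model_strict <- svm(spambaseNONSPAM, y = NULL,
                    type = "one-classification", nu = 0.1)
table(predict(model_strict, spambaseNONSPAM))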
# Daily air quality measurements in New York, May to September 1973
data(airquality)
head(airquality)
# train an SVM one-classification model
model <- svm(airquality, y = NULL, type = "one-classification")
summary(model)
Call:
svm.default(x = airquality, y = NULL, type = "one-classification")
Parameters:
SVM-Type: one-classification
SVM-Kernel: radial
gamma: 0.1666667
nu: 0.5
Number of Support Vectors: 58
Number of Classes: 1
# CAUTION: testing on the same training set
# TRUE means the sample belongs to the learned class; FALSE marks suspect outliers
pred <- predict(model, airquality)
# which(pred == FALSE)
table(pred)
pred
FALSE TRUE
51 60
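# airquality has 153 rows but only 51 + 60 = 111 predictions appear above:
# svm's default na.action = na.omit drops the rows with missing Ozone or
# Solar.R values, and predict skips them as well. This can be verified with:
sum(complete.cases(airquality))  # 111 rows without any NA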
# calculate an "outlierness" score with LOF (Local Outlier Factor)
outlierness <- LOF(dataset = spambaseNONSPAM, k = 5)
# assign an index to outlierness values
names(outlierness) <- seq_len(nrow(spambaseNONSPAM))
# sort(outlierness, decreasing = TRUE)
hist(outlierness)
which(outlierness > 20.0)
88  152  277  313  410  652  724  753  780  784  801  857  980 1217 1237
1387 1416 1426 1434 1591 1726 1790 1794 1799 1827 1832 1864 1867 1994 2022
2025 2100 2492 2581 2583 2721 2725 2734
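# Rather than fixing a hard threshold such as 20.0, it can be informative
# to inspect the largest scores directly -- a minimal sketch:
head(sort(outlierness, decreasing = TRUE), 10)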
# Daily closing prices of major European stock indices, 1991-1998 (1860 observations)
data("EuStockMarkets")
EuStockMarkets[sample(nrow(EuStockMarkets), 10), ]
DAX SMI CAC FTSE
[1,] 1729.96 1840.5 1963.3 2591.0
[2,] 1617.78 1761.9 1755.4 2348.0
[3,] 2182.47 3150.1 1852.6 3541.6
[4,] 1437.65 1878.4 1657.3 2541.2
[5,] 2449.09 3309.9 1974.5 3715.9
[6,] 3018.58 4209.1 2503.1 4228.4
[7,] 2764.00 3829.8 2229.1 3934.3
[8,] 5870.49 7816.9 4215.7 5877.4
[9,] 5774.38 8139.2 4095.0 5809.7
[10,] 1617.18 2247.5 1890.4 2846.9
# calculate an "outlierness" score with LOF
outlierness <- LOF(dataset = EuStockMarkets, k = 5)
# assign an index to outlierness values
names(outlierness) <- seq_len(nrow(EuStockMarkets))
# sort(outlierness, decreasing = TRUE)
hist(outlierness)
which(outlierness > 2.0)
36 37 331 332
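# LOF scores depend on the neighbourhood size k, so it is worth checking
# whether the flagged indices are stable across a few values -- a sketch:
for (k in c(5, 10, 20)) {
  scores <- LOF(dataset = EuStockMarkets, k = k)
  cat("k =", k, "->", which(scores > 2.0), "\n")
}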
h2o.init(port = 50001)
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 1 hours 48 minutes
H2O cluster timezone: Europe/Madrid
H2O data parsing timezone: UTC
H2O cluster version: 3.34.0.3
H2O cluster version age: 1 month and 26 days
H2O cluster name: H2O_started_from_R_julet_nys902
H2O cluster total nodes: 1
H2O cluster total memory: 1.46 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 50001
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
R Version: R version 4.1.2 (2021-11-01)
spambase <- h2o.importFile(path = "../data/spambase.csv")
# keep only the non-spam samples; in the H2O frame the class column is named C58
spambaseNONSPAM <- spambase[spambase$C58 == 0, ]
# drop the class column (column 58)
spambaseNONSPAM <- spambaseNONSPAM[, -58]
# learn autoencoder with 2 hidden layers of 10 units each
autoencoder_model <- h2o.deeplearning(
x = 1:57,
training_frame = spambaseNONSPAM,
autoencoder = TRUE,
hidden = c(10, 10),
epochs = 5
)
# features in the autoencoder's first hidden layer
deep_features_layer1 <- h2o.deepfeatures(autoencoder_model, spambaseNONSPAM, layer = 1)
# further supervised models can be trained with these features
head(deep_features_layer1)
# per-sample reconstruction error: high values indicate potential outliers
reconstruction_error <- h2o.anomaly(autoencoder_model, spambaseNONSPAM)
head(reconstruction_error)
reconstruction_error <- as.data.frame(reconstruction_error)
plot(sort(reconstruction_error$Reconstruction.MSE), main = "Reconstruction Error")
which(reconstruction_error$Reconstruction.MSE > 0.02)
[1] 66 110 152 202 241 458 466 515 671 673 742 762 768 802
[15] 818 861 884 910 990 1005 1036 1103 1139 1141 1162 1163 1193 1220
[29] 1366 1413 1433 1434 1438 1460 1491 1500 1551 1596 1611 1625 1662 1678
[43] 1719 1723 1749 1750 1755 1953 1976 2010 2043 2067 2094 2100 2207 2208
[57] 2227 2453 2454 2455 2456 2457 2461 2463 2490 2496 2497 2500 2507 2566
[71] 2651
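# h2o.anomaly can also break the reconstruction error down per feature
# (per_feature = TRUE in recent h2o versions), which helps explain why a
# particular row was flagged -- a sketch, assuming the installed version
# supports that argument:
per_feature_error <- h2o.anomaly(autoencoder_model, spambaseNONSPAM,
                                 per_feature = TRUE)
head(per_feature_error)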
h2o.init(port = 50001)
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 1 hours 49 minutes
H2O cluster timezone: Europe/Madrid
H2O data parsing timezone: UTC
H2O cluster version: 3.34.0.3
H2O cluster version age: 1 month and 26 days
H2O cluster name: H2O_started_from_R_julet_nys902
H2O cluster total nodes: 1
H2O cluster total memory: 1.46 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 50001
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
R Version: R version 4.1.2 (2021-11-01)
prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
prostate <- h2o.importFile(path = prostate_path)
print(prostate)
[380 rows x 9 columns]
# learn autoencoder with 2 hidden layers of 10 units each
autoencoder_model <- h2o.deeplearning(
x = 3:9,
training_frame = prostate,
autoencoder = TRUE,
hidden = c(10, 10),
epochs = 5
)
# features in the autoencoder's first hidden layer
deep_features_layer1 <- h2o.deepfeatures(autoencoder_model, prostate, layer = 1)
# further supervised models can be trained with these features
head(deep_features_layer1)
# per-sample reconstruction error: high values indicate potential outliers
reconstruction_error <- h2o.anomaly(autoencoder_model, prostate)
head(reconstruction_error)
reconstruction_error <- as.data.frame(reconstruction_error)
plot(sort(reconstruction_error$Reconstruction.MSE), main = "Reconstruction Error")
which(reconstruction_error$Reconstruction.MSE > 0.15)
[1] 10 19 20 38 46 52 88 115 139 297 303
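# Instead of an eyeballed cut-off such as 0.15, the threshold can be tied
# to a quantile of the error distribution, e.g. flagging the top 5%:
cutoff <- quantile(reconstruction_error$Reconstruction.MSE, 0.95)
which(reconstruction_error$Reconstruction.MSE > cutoff)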