Análise discriminante linear
# Pacotes.
library(lattice)
library(latticeExtra)
library(MASS)
library(ellipse)
library(mvtnorm)
#-----------------------------------------------------------------------
# Usando um par de preditoras para visualizar a fronteira.
# Obtendo a matriz de covariância residual.
X <- as.matrix(subset(iris, select = c(Sepal.Length, Sepal.Width)))
Xs <- by(X,
INDICES = iris$Species,
FUN = scale,
center = TRUE,
scale = FALSE)
X <- do.call(rbind, Xs)
Sigma <- var(X)
# Dados centrados removendo efeito de Species, escala preservada.
xyplot(X[, 2] ~ X[, 1],
groups = iris$Species,
aspect = "iso") +
layer(panel.ellipse(..., groups = NULL, col = 1))

# Elipses para cada espécie e comum.
xyplot(Sepal.Width ~ Sepal.Length,
groups = Species,
aspect = "iso",
data = iris) +
layer(panel.ellipse(...)) +
glayer({
ell <- ellipse(Sigma,
level = 0.68,
centre = c(mean(x), mean(y)))
print(head(ell))
panel.lines(ell, col = col.line, lty = 2)
})

## Sepal.Length Sepal.Width
## [1,] 5.681179 3.873522
## [2,] 5.656093 3.888281
## [3,] 5.628390 3.901187
## [4,] 5.598180 3.912187
## [5,] 5.565586 3.921238
## [6,] 5.530739 3.928303
## Sepal.Length Sepal.Width
## [1,] 6.611179 3.215522
## [2,] 6.586093 3.230281
## [3,] 6.558390 3.243187
## [4,] 6.528180 3.254187
## [5,] 6.495586 3.263238
## [6,] 6.460739 3.270303
## Sepal.Length Sepal.Width
## [1,] 7.263179 3.419522
## [2,] 7.238093 3.434281
## [3,] 7.210390 3.447187
## [4,] 7.180180 3.458187
## [5,] 7.147586 3.467238
## [6,] 7.112739 3.474303
# Ajuste do modelo.
fit <- lda(Species ~ Sepal.Length + Sepal.Width,
data = iris)
fit
## Call:
## lda(Species ~ Sepal.Length + Sepal.Width, data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width
## setosa 5.006 3.428
## versicolor 5.936 2.770
## virginica 6.588 2.974
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length -2.141178 -0.8152721
## Sepal.Width 2.768109 -2.0960764
##
## Proportion of trace:
## LD1 LD2
## 0.9628 0.0372
# Para apresentar a fronteira.
grid <- with(iris,
expand.grid(
Sepal.Length = seq(min(Sepal.Length),
max(Sepal.Length),
length.out = 41),
Sepal.Width = seq(min(Sepal.Width),
max(Sepal.Width),
length.out = 41),
KEEP.OUT.ATTRS = FALSE))
grid$pred <- predict(fit, newdata = grid)$class
str(grid)
## 'data.frame': 1681 obs. of 3 variables:
## $ Sepal.Length: num 4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
## $ Sepal.Width : num 2 2 2 2 2 2 2 2 2 2 ...
## $ pred : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
# Gráfico com pontos, classficações, fronteira e elipses de confiança.
xyplot(Sepal.Width ~ Sepal.Length,
data = iris,
groups = Species,
pch = 19) +
latticeExtra::layer(panel.xyplot(x = Sepal.Length,
y = Sepal.Width,
groups = pred,
subscripts = seq_along(pred),
pch = 4),
data = grid) +
latticeExtra::glayer({
ell <- ellipse(Sigma,
level = 0.68,
centre = c(mean(x), mean(y)))
panel.lines(ell, col = 1)
}) +
latticeExtra::layer(panel.ellipse(..., lwd = 2))

# Avaliando a densidade das normais multivariadas.
dens <- by(iris[, 1:2],
iris$Species,
FUN = function(x) {
m <- colMeans(x)
p <- nrow(x)/nrow(iris)
p * dmvnorm(grid[, 1:2],
mean = m,
sigma = Sigma)
})
dens <- as.data.frame(do.call(cbind, dens[1:3]))
grid <- cbind(grid, dens)
str(grid)
## 'data.frame': 1681 obs. of 6 variables:
## $ Sepal.Length: num 4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
## $ Sepal.Width : num 2 2 2 2 2 2 2 2 2 2 ...
## $ pred : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ setosa : num 2.78e-05 2.21e-05 1.67e-05 1.22e-05 8.46e-06 ...
## $ versicolor : num 0.00171 0.00272 0.00416 0.00607 0.0085 ...
## $ virginica : num 1.35e-05 2.73e-05 5.25e-05 9.70e-05 1.72e-04 ...
wp <- wireframe(setosa + versicolor + virginica ~
Sepal.Length + Sepal.Width,
zlab = "Densidade",
data = grid,
par.settings = simpleTheme(alpha = 0.7))
wp

# Vendo de cima.
update(wp,
par.settings = simpleTheme(alpha = 1),
screen = list(x = 0, z = 0, y = 0))

# library(wzRfun)
# library(rpanel)
# rp.wire(wp)
# Predições.
yp <- predict(fit)$class
# Acurácia.
tb <- table(yp, iris$Species)
tb
##
## yp setosa versicolor virginica
## setosa 49 0 0
## versicolor 1 36 15
## virginica 0 14 35
## [1] 0.8
#-----------------------------------------------------------------------
# Usando todas as variáveis.
# Ajuste do modelo.
fit <- lda(Species ~ .,
data = iris)
fit
## Call:
## lda(Species ~ ., data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 0.02410215
## Sepal.Width 1.5344731 2.16452123
## Petal.Length -2.2012117 -0.93192121
## Petal.Width -2.8104603 2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
# Predições.
yp <- predict(fit)$class
# Acurácia.
tb <- table(yp, iris$Species)
tb
##
## yp setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
## [1] 0.98
Análise discriminante quadrática
# Ajuste do modelo.
fit <- qda(Species ~ Sepal.Length + Sepal.Width,
data = iris)
fit
## Call:
## qda(Species ~ Sepal.Length + Sepal.Width, data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width
## setosa 5.006 3.428
## versicolor 5.936 2.770
## virginica 6.588 2.974
# Para apresentar a fronteira.
grid$pred <- predict(fit, newdata = grid)$class
str(grid)
## 'data.frame': 1681 obs. of 6 variables:
## $ Sepal.Length: num 4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
## $ Sepal.Width : num 2 2 2 2 2 2 2 2 2 2 ...
## $ pred : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ setosa : num 2.78e-05 2.21e-05 1.67e-05 1.22e-05 8.46e-06 ...
## $ versicolor : num 0.00171 0.00272 0.00416 0.00607 0.0085 ...
## $ virginica : num 1.35e-05 2.73e-05 5.25e-05 9.70e-05 1.72e-04 ...
# Gráfico com pontos, classficações, fronteira e elipses de confiança.
xyplot(Sepal.Width ~ Sepal.Length,
data = iris,
groups = Species,
pch = 19) +
layer(panel.xyplot(x = Sepal.Length,
y = Sepal.Width,
groups = pred,
subscripts = seq_along(pred),
pch = 4),
data = grid,
under = TRUE) +
glayer({
ell <- ellipse(Sigma,
level = 0.68,
centre = c(mean(x), mean(y)))
print(head(ell))
panel.lines(ell, col = 1)
}) +
layer(panel.ellipse(..., lwd = 2))

## Sepal.Length Sepal.Width
## [1,] 5.681179 3.873522
## [2,] 5.656093 3.888281
## [3,] 5.628390 3.901187
## [4,] 5.598180 3.912187
## [5,] 5.565586 3.921238
## [6,] 5.530739 3.928303
## Sepal.Length Sepal.Width
## [1,] 6.611179 3.215522
## [2,] 6.586093 3.230281
## [3,] 6.558390 3.243187
## [4,] 6.528180 3.254187
## [5,] 6.495586 3.263238
## [6,] 6.460739 3.270303
## Sepal.Length Sepal.Width
## [1,] 7.263179 3.419522
## [2,] 7.238093 3.434281
## [3,] 7.210390 3.447187
## [4,] 7.180180 3.458187
## [5,] 7.147586 3.467238
## [6,] 7.112739 3.474303
# Avaliando a densidade das normais multivariadas.
grid <- subset(grid, select = setdiff(names(grid), names(dens)))
dens <- by(iris[, 1:2],
iris$Species,
FUN = function(x) {
m <- colMeans(x)
p <- nrow(x)/nrow(iris)
p * dmvnorm(grid[, 1:2],
mean = m,
sigma = var(x))
})
dens <- as.data.frame(do.call(cbind, dens[1:3]))
grid <- cbind(grid, dens)
str(grid)
## 'data.frame': 1681 obs. of 6 variables:
## $ Sepal.Length: num 4.3 4.39 4.48 4.57 4.66 4.75 4.84 4.93 5.02 5.11 ...
## $ Sepal.Width : num 2 2 2 2 2 2 2 2 2 2 ...
## $ pred : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ setosa : num 2.43e-04 1.44e-04 7.36e-05 3.26e-05 1.25e-05 ...
## $ versicolor : num 0.00165 0.00255 0.00376 0.00532 0.00723 ...
## $ virginica : num 0.000136 0.0002 0.000286 0.000399 0.000543 ...
wp <- wireframe(setosa + versicolor + virginica ~
Sepal.Length + Sepal.Width,
zlab = "Densidade",
data = grid,
par.settings = simpleTheme(alpha = 0.5))
wp

# Vendo de cima.
update(wp,
par.settings = simpleTheme(alpha = 1),
screen = list(x = 0, z = 0, y = 0))

# Predições.
yp <- predict(fit)$class
# Acurácia.
tb <- table(yp, iris$Species)
tb
##
## yp setosa versicolor virginica
## setosa 49 0 0
## versicolor 1 37 16
## virginica 0 13 34
## [1] 0.8
#-----------------------------------------------------------------------
# Usando todas as variáveis.
# Ajuste do modelo.
fit <- qda(Species ~ .,
data = iris)
fit
## Call:
## qda(Species ~ ., data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
# Predições.
yp <- predict(fit)$class
# Acurácia.
tb <- table(yp, iris$Species)
tb
##
## yp setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
## [1] 0.98
Dados de uva
#-----------------------------------------------------------------------
# Dados hospedados na web.
url <- "http://www.leg.ufpr.br/~walmes/data/areafoliarUva.txt"
uva <- read.table(url, header = TRUE, sep = "\t",
stringsAsFactors = FALSE)
uva$cult <- factor(uva$cult)
uva$id <- NULL
# Comprimento da nervura lateral: média dos lados direito e esquerdo.
uva$nl <- with(uva, apply(cbind(nld, nle), 1, mean))
uva <- subset(uva, select = -c(nld, nle))
str(uva)
## 'data.frame': 300 obs. of 7 variables:
## $ cult: Factor w/ 3 levels "malbec","merlot",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ area: num 100.8 85.8 119.5 137 84.7 ...
## $ mc : num 12 11.5 12.5 15.5 10 12 15.5 17.5 13.5 13.3 ...
## $ nc : num 7.5 9 8.5 10 7 8.5 11 13 10 9.5 ...
## $ ml : num 12.8 10.5 13 14.4 11 12 14 14 12 15 ...
## $ cll : num 9.5 9.5 10.2 12 7.5 8.9 13.5 10.8 9.7 10.3 ...
## $ nl : num 6.95 7.75 8.8 9.5 6.75 ...
splom(uva[-(1:2)],
groups = uva$cult,
auto.key = TRUE,
cex = 0.2)

m1 <- lda(cult ~ ., data = uva)
m1
## Call:
## lda(cult ~ ., data = uva)
##
## Prior probabilities of groups:
## malbec merlot sauvignonblanc
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## area mc nc ml cll nl
## malbec 116.44748 12.941 9.410 12.263 10.483 8.3540
## merlot 144.40203 15.403 10.851 14.190 12.056 9.3560
## sauvignonblanc 99.61412 11.641 8.235 11.536 9.213 7.3405
##
## Coefficients of linear discriminants:
## LD1 LD2
## area 0.03488656 -0.006443026
## mc -0.77691545 -0.762481480
## nc 0.34141328 0.485986328
## ml -0.15452055 -0.485121884
## cll -0.14263063 0.068195514
## nl -0.01744802 1.593497512
##
## Proportion of trace:
## LD1 LD2
## 0.7243 0.2757
m2 <- qda(cult ~ ., data = uva)
m2
## Call:
## qda(cult ~ ., data = uva)
##
## Prior probabilities of groups:
## malbec merlot sauvignonblanc
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## area mc nc ml cll nl
## malbec 116.44748 12.941 9.410 12.263 10.483 8.3540
## merlot 144.40203 15.403 10.851 14.190 12.056 9.3560
## sauvignonblanc 99.61412 11.641 8.235 11.536 9.213 7.3405
y1 <- predict(m1)$class
y2 <- predict(m2)$class
tb1 <- table(y1, uva$cult)
tb1
##
## y1 malbec merlot sauvignonblanc
## malbec 50 18 20
## merlot 21 68 25
## sauvignonblanc 29 14 55
## [1] 0.5766667
tb2 <- table(y2, uva$cult)
tb2
##
## y2 malbec merlot sauvignonblanc
## malbec 46 14 13
## merlot 24 72 21
## sauvignonblanc 30 14 66
## [1] 0.6133333
Usando o pacote caret
Exemplo com o iris
#-----------------------------------------------------------------------
library(caret)
# Criando as partições de treino e validação.
set.seed(135)
intrain <- createDataPartition(y = iris$Species,
p = 0.75,
list = FALSE)
data_train <- iris[intrain, ]
data_test <- iris[-intrain, ]
# Parametriza a valiação cruzada.
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
fit1 <- train(Species ~ .,
data = data_train,
method = "lda",
trControl = trctrl)
fit1
## Linear Discriminant Analysis
##
## 114 samples
## 4 predictors
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 102, 102, 102, 102, 102, 104, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9767677 0.9649406
# Predição e matriz de confusão.
yp1 <- predict(fit1, newdata = data_test)
confusionMatrix(yp1, data_test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 11 0
## virginica 0 1 12
##
## Overall Statistics
##
## Accuracy : 0.9722
## 95% CI : (0.8547, 0.9993)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 4.864e-16
##
## Kappa : 0.9583
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9167 1.0000
## Specificity 1.0000 1.0000 0.9583
## Pos Pred Value 1.0000 1.0000 0.9231
## Neg Pred Value 1.0000 0.9600 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3056 0.3333
## Detection Prevalence 0.3333 0.3056 0.3611
## Balanced Accuracy 1.0000 0.9583 0.9792
fit2 <- train(Species ~ .,
data = data_train,
method = "qda",
trControl = trctrl)
fit2
## Quadratic Discriminant Analysis
##
## 114 samples
## 4 predictors
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 103, 102, 103, 103, 102, 102, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9741919 0.9611995
# Predição e matriz de confusão.
yp2 <- predict(fit2, newdata = data_test)
confusionMatrix(yp2, data_test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 11 0
## virginica 0 1 12
##
## Overall Statistics
##
## Accuracy : 0.9722
## 95% CI : (0.8547, 0.9993)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 4.864e-16
##
## Kappa : 0.9583
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9167 1.0000
## Specificity 1.0000 1.0000 0.9583
## Pos Pred Value 1.0000 1.0000 0.9231
## Neg Pred Value 1.0000 0.9600 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3056 0.3333
## Detection Prevalence 0.3333 0.3056 0.3611
## Balanced Accuracy 1.0000 0.9583 0.9792
Exemplo com o conjunto de dados de folhas de uva
#-----------------------------------------------------------------------
# Criando as partições de treino e validação.
set.seed(135)
intrain <- createDataPartition(y = uva$cult,
p = 0.75,
list = FALSE)
data_train <- uva[intrain, ]
data_test <- uva[-intrain, ]
fit1 <- train(cult ~ .,
data = data_train,
method = "lda",
trControl = trctrl)
fit1
## Linear Discriminant Analysis
##
## 225 samples
## 6 predictors
## 3 classes: 'malbec', 'merlot', 'sauvignonblanc'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 202, 204, 202, 203, 203, 203, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5872585 0.3803179
# Predição e matriz de confusão.
yp1 <- predict(fit1, newdata = data_test)
confusionMatrix(yp1, data_test$cult)
## Confusion Matrix and Statistics
##
## Reference
## Prediction malbec merlot sauvignonblanc
## malbec 11 3 7
## merlot 5 16 10
## sauvignonblanc 9 6 8
##
## Overall Statistics
##
## Accuracy : 0.4667
## 95% CI : (0.3505, 0.5855)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 0.01126
##
## Kappa : 0.2
## Mcnemar's Test P-Value : 0.62588
##
## Statistics by Class:
##
## Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity 0.4400 0.6400 0.3200
## Specificity 0.8000 0.7000 0.7000
## Pos Pred Value 0.5238 0.5161 0.3478
## Neg Pred Value 0.7407 0.7955 0.6731
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.1467 0.2133 0.1067
## Detection Prevalence 0.2800 0.4133 0.3067
## Balanced Accuracy 0.6200 0.6700 0.5100
fit2 <- train(cult ~ .,
data = data_train,
method = "qda",
trControl = trctrl)
fit2
## Quadratic Discriminant Analysis
##
## 225 samples
## 6 predictors
## 3 classes: 'malbec', 'merlot', 'sauvignonblanc'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 203, 202, 203, 203, 203, 203, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5963423 0.3939673
# Predição e matriz de confusão.
yp2 <- predict(fit2, newdata = data_test)
confusionMatrix(yp2, data_test$cult)
## Confusion Matrix and Statistics
##
## Reference
## Prediction malbec merlot sauvignonblanc
## malbec 9 4 7
## merlot 7 17 9
## sauvignonblanc 9 4 9
##
## Overall Statistics
##
## Accuracy : 0.4667
## 95% CI : (0.3505, 0.5855)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 0.01126
##
## Kappa : 0.2
## Mcnemar's Test P-Value : 0.39297
##
## Statistics by Class:
##
## Class: malbec Class: merlot Class: sauvignonblanc
## Sensitivity 0.3600 0.6800 0.3600
## Specificity 0.7800 0.6800 0.7400
## Pos Pred Value 0.4500 0.5152 0.4091
## Neg Pred Value 0.7091 0.8095 0.6981
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.1200 0.2267 0.1200
## Detection Prevalence 0.2667 0.4400 0.2933
## Balanced Accuracy 0.5700 0.6800 0.5500