Implementando o Naive Bayes para variáveis categóricas
rm(list = objects())
library(lattice)
library(latticeExtra)
# Documentação dos dados.
# help(UCBAdmissions, help_type = "html")
# Carrega.
data(UCBAdmissions)
# Dados da forma de um array cúbico.
str(UCBAdmissions)
## table [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
## - attr(*, "dimnames")=List of 3
## ..$ Admit : chr [1:2] "Admitted" "Rejected"
## ..$ Gender: chr [1:2] "Male" "Female"
## ..$ Dept : chr [1:6] "A" "B" "C" "D" ...
# Gráfico de mosaico.
mosaicplot(UCBAdmissions)

# Frequencia cruzada.
addmargins(margin.table(UCBAdmissions, 1:2))
## Gender
## Admit Male Female Sum
## Admitted 1198 557 1755
## Rejected 1493 1278 2771
## Sum 2691 1835 4526
# Transforma em tabela.
da <- as.data.frame(UCBAdmissions)
da
## Admit Gender Dept Freq
## 1 Admitted Male A 512
## 2 Rejected Male A 313
## 3 Admitted Female A 89
## 4 Rejected Female A 19
## 5 Admitted Male B 353
## 6 Rejected Male B 207
## 7 Admitted Female B 17
## 8 Rejected Female B 8
## 9 Admitted Male C 120
## 10 Rejected Male C 205
## 11 Admitted Female C 202
## 12 Rejected Female C 391
## 13 Admitted Male D 138
## 14 Rejected Male D 279
## 15 Admitted Female D 131
## 16 Rejected Female D 244
## 17 Admitted Male E 53
## 18 Rejected Male E 138
## 19 Admitted Female E 94
## 20 Rejected Female E 299
## 21 Admitted Male F 22
## 22 Rejected Male F 351
## 23 Admitted Female F 24
## 24 Rejected Female F 317
# Total de casos.
tot <- sum(da$Freq)
tot
## [1] 4526
# Divide nos níveis de Admit e calcula a marginal e todas as
# condicionais. Retorna as probabilidades.
probs <- by(data = da,
INDICES = da$Admit,
FUN = function(a_subset) {
with(a_subset, {
a <- as.character(a_subset$Admit[1])
# Freq(A = a) e Prob(A = a)
f_a <- sum(Freq)
p_a <- f_a/tot
# Freq(g | A = a) e Prob(g | A = a)
f_g.a <- tapply(Freq, Gender, sum)
p_g.a <- f_g.a/f_a
# Freq(d | A = a) e Prob(g | A = a).
f_d.a <- tapply(Freq, Dept, sum)
p_d.a <- f_d.a/f_a
cat("------------------------------\n")
cat(sprintf("P(A = %s): %0.3f", a, p_a), "\n\n")
cat(sprintf("P(G = %s | A = %s): %0.3f",
names(p_g.a), a, p_g.a),
sep = "\n")
cat("\n")
cat(sprintf("P(D = %s | A = %s): %0.3f",
names(p_d.a), a, p_d.a),
sep = "\n")
cat("\n")
probs <- (p_a) * outer(p_g.a, p_d.a, FUN = "*")
probs <- plyr::adply(probs, seq_along(dim(probs)))
names(probs) <- c("Gender", "Dept", a)
return(probs)
})
})
## ------------------------------
## P(A = Admitted): 0.388
##
## P(G = Male | A = Admitted): 0.683
## P(G = Female | A = Admitted): 0.317
##
## P(D = A | A = Admitted): 0.342
## P(D = B | A = Admitted): 0.211
## P(D = C | A = Admitted): 0.183
## P(D = D | A = Admitted): 0.153
## P(D = E | A = Admitted): 0.084
## P(D = F | A = Admitted): 0.026
##
## ------------------------------
## P(A = Rejected): 0.612
##
## P(G = Male | A = Rejected): 0.539
## P(G = Female | A = Rejected): 0.461
##
## P(D = A | A = Rejected): 0.120
## P(D = B | A = Rejected): 0.078
## P(D = C | A = Rejected): 0.215
## P(D = D | A = Rejected): 0.189
## P(D = E | A = Rejected): 0.158
## P(D = F | A = Rejected): 0.241
## da$Admit: Admitted
## Gender Dept Admitted
## 1 Male A 0.090644116
## 2 Female A 0.042144218
## 3 Male B 0.055804198
## 4 Female B 0.025945691
## 5 Male C 0.048564735
## 6 Female C 0.022579764
## 7 Male D 0.040571160
## 8 Female D 0.018863219
## 9 Male E 0.022170857
## 10 Female E 0.010308153
## 11 Male F 0.006937819
## 12 Female F 0.003225681
## --------------------------------------------------------
## da$Admit: Rejected
## Gender Dept Rejected
## 1 Male A 0.03952272
## 2 Female A 0.03383124
## 3 Male B 0.02559453
## 4 Female B 0.02190878
## 5 Male C 0.07095042
## 6 Female C 0.06073318
## 7 Male D 0.06226019
## 8 Female D 0.05329439
## 9 Male E 0.05202237
## 10 Female E 0.04453087
## 11 Male F 0.07952162
## 12 Female F 0.06807008
# Fazendo a junção recursiva.
probs <- Reduce(merge, probs)
# probs
A <- levels(da$Admit)
probs$class <- A[apply(probs[, A],
MARGIN = 1,
FUN = which.max)]
probs
## Gender Dept Admitted Rejected class
## 1 Female A 0.042144218 0.03383124 Admitted
## 2 Female B 0.025945691 0.02190878 Admitted
## 3 Female C 0.022579764 0.06073318 Rejected
## 4 Female D 0.018863219 0.05329439 Rejected
## 5 Female E 0.010308153 0.04453087 Rejected
## 6 Female F 0.003225681 0.06807008 Rejected
## 7 Male A 0.090644116 0.03952272 Admitted
## 8 Male B 0.055804198 0.02559453 Admitted
## 9 Male C 0.048564735 0.07095042 Rejected
## 10 Male D 0.040571160 0.06226019 Rejected
## 11 Male E 0.022170857 0.05202237 Rejected
## 12 Male F 0.006937819 0.07952162 Rejected
#-----------------------------------------------------------------------
# Repetindo com os dados do HairEyeColor.
HairEyeColor
## , , Sex = Male
##
## Eye
## Hair Brown Blue Hazel Green
## Black 32 11 10 3
## Brown 53 50 25 15
## Red 10 10 7 7
## Blond 3 30 5 8
##
## , , Sex = Female
##
## Eye
## Hair Brown Blue Hazel Green
## Black 36 9 5 2
## Brown 66 34 29 14
## Red 16 7 7 7
## Blond 4 64 5 8
# Transforma em tabela.
da <- as.data.frame(HairEyeColor)
da
## Hair Eye Sex Freq
## 1 Black Brown Male 32
## 2 Brown Brown Male 53
## 3 Red Brown Male 10
## 4 Blond Brown Male 3
## 5 Black Blue Male 11
## 6 Brown Blue Male 50
## 7 Red Blue Male 10
## 8 Blond Blue Male 30
## 9 Black Hazel Male 10
## 10 Brown Hazel Male 25
## 11 Red Hazel Male 7
## 12 Blond Hazel Male 5
## 13 Black Green Male 3
## 14 Brown Green Male 15
## 15 Red Green Male 7
## 16 Blond Green Male 8
## 17 Black Brown Female 36
## 18 Brown Brown Female 66
## 19 Red Brown Female 16
## 20 Blond Brown Female 4
## 21 Black Blue Female 9
## 22 Brown Blue Female 34
## 23 Red Blue Female 7
## 24 Blond Blue Female 64
## 25 Black Hazel Female 5
## 26 Brown Hazel Female 29
## 27 Red Hazel Female 7
## 28 Blond Hazel Female 5
## 29 Black Green Female 2
## 30 Brown Green Female 14
## 31 Red Green Female 7
## 32 Blond Green Female 8
# Total de casos.
tot <- sum(da$Freq)
tot
## [1] 592
# Divide nos níveis de Eye e calcula a marginal e todas as
# condicionais. Retorna as probabilidades.
probs <- by(data = da,
INDICES = da$Eye,
FUN = function(a_subset) {
with(a_subset, {
a <- as.character(a_subset$Eye[1])
# Freq(A = a) e Prob(A = a)
f_a <- sum(Freq)
p_a <- f_a/tot
# Freq(g | A = a) e Prob(g | A = a)
f_g.a <- tapply(Freq, Sex, sum)
p_g.a <- f_g.a/f_a
# Freq(d | A = a) e Prob(g | A = a).
f_d.a <- tapply(Freq, Hair, sum)
p_d.a <- f_d.a/f_a
cat("------------------------------\n")
cat(sprintf("P(%s): %0.3f", a, p_a), "\n\n")
cat(sprintf("P(%s | %s): %0.3f",
names(p_g.a), a, p_g.a),
sep = "\n")
cat("\n")
cat(sprintf("P(%s | %s): %0.3f",
names(p_d.a), a, p_d.a),
sep = "\n")
cat("\n")
probs <- (p_a) * outer(p_g.a, p_d.a, FUN = "*")
probs <- plyr::adply(probs, seq_along(dim(probs)))
names(probs) <- c("Sex", "Hair", a)
return(probs)
})
})
## ------------------------------
## P(Brown): 0.372
##
## P(Male | Brown): 0.445
## P(Female | Brown): 0.555
##
## P(Black | Brown): 0.309
## P(Brown | Brown): 0.541
## P(Red | Brown): 0.118
## P(Blond | Brown): 0.032
##
## ------------------------------
## P(Blue): 0.363
##
## P(Male | Blue): 0.470
## P(Female | Blue): 0.530
##
## P(Black | Blue): 0.093
## P(Brown | Blue): 0.391
## P(Red | Blue): 0.079
## P(Blond | Blue): 0.437
##
## ------------------------------
## P(Hazel): 0.157
##
## P(Male | Hazel): 0.505
## P(Female | Hazel): 0.495
##
## P(Black | Hazel): 0.161
## P(Brown | Hazel): 0.581
## P(Red | Hazel): 0.151
## P(Blond | Hazel): 0.108
##
## ------------------------------
## P(Green): 0.108
##
## P(Male | Green): 0.516
## P(Female | Green): 0.484
##
## P(Black | Green): 0.078
## P(Brown | Green): 0.453
## P(Red | Green): 0.219
## P(Blond | Green): 0.250
# probs
# Fazendo a junção recursiva.
probs <- Reduce(merge, probs)
# probs
A <- levels(da$Eye)
probs$class <- A[apply(probs[, A],
MARGIN = 1,
FUN = which.max)]
probs
## Sex Hair Brown Blue Hazel Green class
## 1 Female Black 0.063697789 0.01791326 0.012532694 0.004091005 Brown
## 2 Female Blond 0.006557125 0.08419233 0.008355129 0.013091216 Blue
## 3 Female Brown 0.111471130 0.07523570 0.045117698 0.023727829 Brown
## 4 Female Red 0.024355037 0.01522627 0.011697181 0.011454814 Brown
## 5 Male Black 0.051167076 0.01587052 0.012805144 0.004354941 Brown
## 6 Male Blond 0.005267199 0.07459145 0.008536763 0.013935811 Blue
## 7 Male Brown 0.089542383 0.06665619 0.046098518 0.025258657 Brown
## 8 Male Red 0.019563882 0.01348994 0.011951468 0.012193834 Brown
Usando o pacote e1071
library(e1071)
# help(naiveBayes, h = "html")
# Converte array para tabela.
hec <- as.data.frame(HairEyeColor)
# O dado está agregado. As linhas terão que ser repetidas baseado em
# Freq.
r <- rep(seq_len(nrow(hec)), hec$Freq)
hec$Freq <- NULL
hec <- hec[r, ]
str(hec)
## 'data.frame': 592 obs. of 3 variables:
## $ Hair: Factor w/ 4 levels "Black","Brown",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Eye : Factor w/ 4 levels "Brown","Blue",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
# Faz o ajuste do modelo.
nb <- naiveBayes(Eye ~ Hair + Sex, data = hec)
# Classe e métodos.
class(nb)
## [1] "naiveBayes"
methods(class = class(nb))
## [1] predict print
## see '?methods' for accessing help and source code
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Brown Blue Hazel Green
## 0.3716216 0.3631757 0.1570946 0.1081081
##
## Conditional probabilities:
## Hair
## Y Black Brown Red Blond
## Brown 0.30909091 0.54090909 0.11818182 0.03181818
## Blue 0.09302326 0.39069767 0.07906977 0.43720930
## Hazel 0.16129032 0.58064516 0.15053763 0.10752688
## Green 0.07812500 0.45312500 0.21875000 0.25000000
##
## Sex
## Y Male Female
## Brown 0.4454545 0.5545455
## Blue 0.4697674 0.5302326
## Hazel 0.5053763 0.4946237
## Green 0.5156250 0.4843750
# pred <- with(hec,
# expand.grid(Sex = levels(Sex),
# Hair = levels(Hair),
# KEEP.OUT.ATTRS = FALSE))
pred <- probs[, c("Sex", "Hair")]
# As probabilidades para cada classe.
probs[, A]/rowSums(probs[, A])
## Brown Blue Hazel Green
## 1 0.64842420 0.1823516 0.12757903 0.04164519
## 2 0.05844359 0.7504054 0.07446918 0.11668187
## 3 0.43619684 0.2944043 0.17654972 0.09284919
## 4 0.38823137 0.2427143 0.18645887 0.18259542
## 5 0.60770172 0.1884912 0.15208428 0.05172281
## 6 0.05147206 0.7289217 0.08342285 0.13618337
## 7 0.39349647 0.2929225 0.20258120 0.11099986
## 8 0.34203113 0.2358418 0.20894493 0.21318218
predict(nb, newdata = pred, type = "raw")
## Brown Blue Hazel Green
## [1,] 0.64842420 0.1823516 0.12757903 0.04164519
## [2,] 0.05844359 0.7504054 0.07446918 0.11668187
## [3,] 0.43619684 0.2944043 0.17654972 0.09284919
## [4,] 0.38823137 0.2427143 0.18645887 0.18259542
## [5,] 0.60770172 0.1884912 0.15208428 0.05172281
## [6,] 0.05147206 0.7289217 0.08342285 0.13618337
## [7,] 0.39349647 0.2929225 0.20258120 0.11099986
## [8,] 0.34203113 0.2358418 0.20894493 0.21318218
# A classe predita.
pred$class <- predict(nb, newdata = pred, type = "class")
pred
## Sex Hair class
## 1 Female Black Brown
## 2 Female Blond Blue
## 3 Female Brown Brown
## 4 Female Red Brown
## 5 Male Black Brown
## 6 Male Blond Blue
## 7 Male Brown Brown
## 8 Male Red Brown