1 Implementando o Naive Bayes para variáveis categóricas

rm(list = objects())
library(lattice)
library(latticeExtra)
# Documentação dos dados.
# help(UCBAdmissions, help_type = "html")

# Carrega.
data(UCBAdmissions)

# Dados da forma de um array cúbico.
str(UCBAdmissions)
##  table [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Admit : chr [1:2] "Admitted" "Rejected"
##   ..$ Gender: chr [1:2] "Male" "Female"
##   ..$ Dept  : chr [1:6] "A" "B" "C" "D" ...
# Gráfico de mosaico.
mosaicplot(UCBAdmissions)

# Frequencia cruzada.
addmargins(margin.table(UCBAdmissions, 1:2))
##           Gender
## Admit      Male Female  Sum
##   Admitted 1198    557 1755
##   Rejected 1493   1278 2771
##   Sum      2691   1835 4526
# Transforma em tabela.
da <- as.data.frame(UCBAdmissions)
da
##       Admit Gender Dept Freq
## 1  Admitted   Male    A  512
## 2  Rejected   Male    A  313
## 3  Admitted Female    A   89
## 4  Rejected Female    A   19
## 5  Admitted   Male    B  353
## 6  Rejected   Male    B  207
## 7  Admitted Female    B   17
## 8  Rejected Female    B    8
## 9  Admitted   Male    C  120
## 10 Rejected   Male    C  205
## 11 Admitted Female    C  202
## 12 Rejected Female    C  391
## 13 Admitted   Male    D  138
## 14 Rejected   Male    D  279
## 15 Admitted Female    D  131
## 16 Rejected Female    D  244
## 17 Admitted   Male    E   53
## 18 Rejected   Male    E  138
## 19 Admitted Female    E   94
## 20 Rejected Female    E  299
## 21 Admitted   Male    F   22
## 22 Rejected   Male    F  351
## 23 Admitted Female    F   24
## 24 Rejected Female    F  317
# Total de casos.
tot <- sum(da$Freq)
tot
## [1] 4526
# Divide nos níveis de Admit e calcula a marginal e todas as
# condicionais. Retorna as probabilidades.
probs <- by(data = da,
            INDICES = da$Admit,
            FUN = function(a_subset) {
                with(a_subset, {
                    a <- as.character(a_subset$Admit[1])
                    # Freq(A = a) e Prob(A = a)
                    f_a <- sum(Freq)
                    p_a <- f_a/tot
                    # Freq(g | A = a) e Prob(g | A = a)
                    f_g.a <- tapply(Freq, Gender, sum)
                    p_g.a <- f_g.a/f_a
                    # Freq(d | A = a) e Prob(g | A = a).
                    f_d.a <- tapply(Freq, Dept, sum)
                    p_d.a <- f_d.a/f_a
                    cat("------------------------------\n")
                    cat(sprintf("P(A = %s): %0.3f", a, p_a), "\n\n")
                    cat(sprintf("P(G = %s | A = %s): %0.3f",
                                names(p_g.a), a, p_g.a),
                        sep = "\n")
                    cat("\n")
                    cat(sprintf("P(D = %s | A = %s): %0.3f",
                                names(p_d.a), a, p_d.a),
                        sep = "\n")
                    cat("\n")
                    probs <- (p_a) * outer(p_g.a, p_d.a, FUN = "*")
                    probs <- plyr::adply(probs, seq_along(dim(probs)))
                    names(probs) <- c("Gender", "Dept", a)
                    return(probs)
                })
            })
## ------------------------------
## P(A = Admitted): 0.388 
## 
## P(G = Male | A = Admitted): 0.683
## P(G = Female | A = Admitted): 0.317
## 
## P(D = A | A = Admitted): 0.342
## P(D = B | A = Admitted): 0.211
## P(D = C | A = Admitted): 0.183
## P(D = D | A = Admitted): 0.153
## P(D = E | A = Admitted): 0.084
## P(D = F | A = Admitted): 0.026
## 
## ------------------------------
## P(A = Rejected): 0.612 
## 
## P(G = Male | A = Rejected): 0.539
## P(G = Female | A = Rejected): 0.461
## 
## P(D = A | A = Rejected): 0.120
## P(D = B | A = Rejected): 0.078
## P(D = C | A = Rejected): 0.215
## P(D = D | A = Rejected): 0.189
## P(D = E | A = Rejected): 0.158
## P(D = F | A = Rejected): 0.241
probs
## da$Admit: Admitted
##    Gender Dept    Admitted
## 1    Male    A 0.090644116
## 2  Female    A 0.042144218
## 3    Male    B 0.055804198
## 4  Female    B 0.025945691
## 5    Male    C 0.048564735
## 6  Female    C 0.022579764
## 7    Male    D 0.040571160
## 8  Female    D 0.018863219
## 9    Male    E 0.022170857
## 10 Female    E 0.010308153
## 11   Male    F 0.006937819
## 12 Female    F 0.003225681
## -------------------------------------------------------- 
## da$Admit: Rejected
##    Gender Dept   Rejected
## 1    Male    A 0.03952272
## 2  Female    A 0.03383124
## 3    Male    B 0.02559453
## 4  Female    B 0.02190878
## 5    Male    C 0.07095042
## 6  Female    C 0.06073318
## 7    Male    D 0.06226019
## 8  Female    D 0.05329439
## 9    Male    E 0.05202237
## 10 Female    E 0.04453087
## 11   Male    F 0.07952162
## 12 Female    F 0.06807008
# Fazendo a junção recursiva.
probs <- Reduce(merge, probs)
# probs

A <- levels(da$Admit)
probs$class <- A[apply(probs[, A],
                       MARGIN = 1,
                       FUN = which.max)]
probs
##    Gender Dept    Admitted   Rejected    class
## 1  Female    A 0.042144218 0.03383124 Admitted
## 2  Female    B 0.025945691 0.02190878 Admitted
## 3  Female    C 0.022579764 0.06073318 Rejected
## 4  Female    D 0.018863219 0.05329439 Rejected
## 5  Female    E 0.010308153 0.04453087 Rejected
## 6  Female    F 0.003225681 0.06807008 Rejected
## 7    Male    A 0.090644116 0.03952272 Admitted
## 8    Male    B 0.055804198 0.02559453 Admitted
## 9    Male    C 0.048564735 0.07095042 Rejected
## 10   Male    D 0.040571160 0.06226019 Rejected
## 11   Male    E 0.022170857 0.05202237 Rejected
## 12   Male    F 0.006937819 0.07952162 Rejected
#-----------------------------------------------------------------------
# Repetindo com os dados do HairEyeColor.

HairEyeColor
## , , Sex = Male
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    32   11    10     3
##   Brown    53   50    25    15
##   Red      10   10     7     7
##   Blond     3   30     5     8
## 
## , , Sex = Female
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    36    9     5     2
##   Brown    66   34    29    14
##   Red      16    7     7     7
##   Blond     4   64     5     8
# Transforma em tabela.
da <- as.data.frame(HairEyeColor)
da
##     Hair   Eye    Sex Freq
## 1  Black Brown   Male   32
## 2  Brown Brown   Male   53
## 3    Red Brown   Male   10
## 4  Blond Brown   Male    3
## 5  Black  Blue   Male   11
## 6  Brown  Blue   Male   50
## 7    Red  Blue   Male   10
## 8  Blond  Blue   Male   30
## 9  Black Hazel   Male   10
## 10 Brown Hazel   Male   25
## 11   Red Hazel   Male    7
## 12 Blond Hazel   Male    5
## 13 Black Green   Male    3
## 14 Brown Green   Male   15
## 15   Red Green   Male    7
## 16 Blond Green   Male    8
## 17 Black Brown Female   36
## 18 Brown Brown Female   66
## 19   Red Brown Female   16
## 20 Blond Brown Female    4
## 21 Black  Blue Female    9
## 22 Brown  Blue Female   34
## 23   Red  Blue Female    7
## 24 Blond  Blue Female   64
## 25 Black Hazel Female    5
## 26 Brown Hazel Female   29
## 27   Red Hazel Female    7
## 28 Blond Hazel Female    5
## 29 Black Green Female    2
## 30 Brown Green Female   14
## 31   Red Green Female    7
## 32 Blond Green Female    8
# Total de casos.
tot <- sum(da$Freq)
tot
## [1] 592
# Divide nos níveis de Eye e calcula a marginal e todas as
# condicionais. Retorna as probabilidades.
probs <- by(data = da,
            INDICES = da$Eye,
            FUN = function(a_subset) {
                with(a_subset, {
                    a <- as.character(a_subset$Eye[1])
                    # Freq(A = a) e Prob(A = a)
                    f_a <- sum(Freq)
                    p_a <- f_a/tot
                    # Freq(g | A = a) e Prob(g | A = a)
                    f_g.a <- tapply(Freq, Sex, sum)
                    p_g.a <- f_g.a/f_a
                    # Freq(d | A = a) e Prob(g | A = a).
                    f_d.a <- tapply(Freq, Hair, sum)
                    p_d.a <- f_d.a/f_a
                    cat("------------------------------\n")
                    cat(sprintf("P(%s): %0.3f", a, p_a), "\n\n")
                    cat(sprintf("P(%s | %s): %0.3f",
                                names(p_g.a), a, p_g.a),
                        sep = "\n")
                    cat("\n")
                    cat(sprintf("P(%s | %s): %0.3f",
                                names(p_d.a), a, p_d.a),
                        sep = "\n")
                    cat("\n")
                    probs <- (p_a) * outer(p_g.a, p_d.a, FUN = "*")
                    probs <- plyr::adply(probs, seq_along(dim(probs)))
                    names(probs) <- c("Sex", "Hair", a)
                    return(probs)
                })
            })
## ------------------------------
## P(Brown): 0.372 
## 
## P(Male | Brown): 0.445
## P(Female | Brown): 0.555
## 
## P(Black | Brown): 0.309
## P(Brown | Brown): 0.541
## P(Red | Brown): 0.118
## P(Blond | Brown): 0.032
## 
## ------------------------------
## P(Blue): 0.363 
## 
## P(Male | Blue): 0.470
## P(Female | Blue): 0.530
## 
## P(Black | Blue): 0.093
## P(Brown | Blue): 0.391
## P(Red | Blue): 0.079
## P(Blond | Blue): 0.437
## 
## ------------------------------
## P(Hazel): 0.157 
## 
## P(Male | Hazel): 0.505
## P(Female | Hazel): 0.495
## 
## P(Black | Hazel): 0.161
## P(Brown | Hazel): 0.581
## P(Red | Hazel): 0.151
## P(Blond | Hazel): 0.108
## 
## ------------------------------
## P(Green): 0.108 
## 
## P(Male | Green): 0.516
## P(Female | Green): 0.484
## 
## P(Black | Green): 0.078
## P(Brown | Green): 0.453
## P(Red | Green): 0.219
## P(Blond | Green): 0.250
# probs

# Fazendo a junção recursiva.
probs <- Reduce(merge, probs)
# probs

A <- levels(da$Eye)
probs$class <- A[apply(probs[, A],
                       MARGIN = 1,
                       FUN = which.max)]
probs
##      Sex  Hair       Brown       Blue       Hazel       Green class
## 1 Female Black 0.063697789 0.01791326 0.012532694 0.004091005 Brown
## 2 Female Blond 0.006557125 0.08419233 0.008355129 0.013091216  Blue
## 3 Female Brown 0.111471130 0.07523570 0.045117698 0.023727829 Brown
## 4 Female   Red 0.024355037 0.01522627 0.011697181 0.011454814 Brown
## 5   Male Black 0.051167076 0.01587052 0.012805144 0.004354941 Brown
## 6   Male Blond 0.005267199 0.07459145 0.008536763 0.013935811  Blue
## 7   Male Brown 0.089542383 0.06665619 0.046098518 0.025258657 Brown
## 8   Male   Red 0.019563882 0.01348994 0.011951468 0.012193834 Brown

2 Usando o pacote e1071

library(e1071)

# help(naiveBayes, h = "html")

# Converte array para tabela.
hec <- as.data.frame(HairEyeColor)

# O dado está agregado. As linhas terão que ser repetidas baseado em
# Freq.
r <- rep(seq_len(nrow(hec)), hec$Freq)
hec$Freq <- NULL
hec <- hec[r, ]
str(hec)
## 'data.frame':    592 obs. of  3 variables:
##  $ Hair: Factor w/ 4 levels "Black","Brown",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Eye : Factor w/ 4 levels "Brown","Blue",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
# Faz o ajuste do modelo.
nb <- naiveBayes(Eye ~ Hair + Sex, data = hec)

# Classe e métodos.
class(nb)
## [1] "naiveBayes"
methods(class = class(nb))
## [1] predict print  
## see '?methods' for accessing help and source code
# Resultado.
nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     Brown      Blue     Hazel     Green 
## 0.3716216 0.3631757 0.1570946 0.1081081 
## 
## Conditional probabilities:
##        Hair
## Y            Black      Brown        Red      Blond
##   Brown 0.30909091 0.54090909 0.11818182 0.03181818
##   Blue  0.09302326 0.39069767 0.07906977 0.43720930
##   Hazel 0.16129032 0.58064516 0.15053763 0.10752688
##   Green 0.07812500 0.45312500 0.21875000 0.25000000
## 
##        Sex
## Y            Male    Female
##   Brown 0.4454545 0.5545455
##   Blue  0.4697674 0.5302326
##   Hazel 0.5053763 0.4946237
##   Green 0.5156250 0.4843750
# pred <- with(hec,
#              expand.grid(Sex = levels(Sex),
#                          Hair = levels(Hair),
#                          KEEP.OUT.ATTRS = FALSE))
pred <- probs[, c("Sex", "Hair")]

# As probabilidades para cada classe.
probs[, A]/rowSums(probs[, A])
##        Brown      Blue      Hazel      Green
## 1 0.64842420 0.1823516 0.12757903 0.04164519
## 2 0.05844359 0.7504054 0.07446918 0.11668187
## 3 0.43619684 0.2944043 0.17654972 0.09284919
## 4 0.38823137 0.2427143 0.18645887 0.18259542
## 5 0.60770172 0.1884912 0.15208428 0.05172281
## 6 0.05147206 0.7289217 0.08342285 0.13618337
## 7 0.39349647 0.2929225 0.20258120 0.11099986
## 8 0.34203113 0.2358418 0.20894493 0.21318218
predict(nb, newdata = pred, type = "raw")
##           Brown      Blue      Hazel      Green
## [1,] 0.64842420 0.1823516 0.12757903 0.04164519
## [2,] 0.05844359 0.7504054 0.07446918 0.11668187
## [3,] 0.43619684 0.2944043 0.17654972 0.09284919
## [4,] 0.38823137 0.2427143 0.18645887 0.18259542
## [5,] 0.60770172 0.1884912 0.15208428 0.05172281
## [6,] 0.05147206 0.7289217 0.08342285 0.13618337
## [7,] 0.39349647 0.2929225 0.20258120 0.11099986
## [8,] 0.34203113 0.2358418 0.20894493 0.21318218
# A classe predita.
pred$class <- predict(nb, newdata = pred, type = "class")
pred
##      Sex  Hair class
## 1 Female Black Brown
## 2 Female Blond  Blue
## 3 Female Brown Brown
## 4 Female   Red Brown
## 5   Male Black Brown
## 6   Male Blond  Blue
## 7   Male Brown Brown
## 8   Male   Red Brown

3 Classificação com preditoras numéricas

splom(~iris[1:4], groups = iris$Species)

# nb <- naiveBayes(iris[, 1:4], iris[, 5])
nb <- naiveBayes(Species ~ ., data = iris)
nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             Sepal.Length
## Y             [,1]      [,2]
##   setosa     5.006 0.3524897
##   versicolor 5.936 0.5161711
##   virginica  6.588 0.6358796
## 
##             Sepal.Width
## Y             [,1]      [,2]
##   setosa     3.428 0.3790644
##   versicolor 2.770 0.3137983
##   virginica  2.974 0.3224966
## 
##             Petal.Length
## Y             [,1]      [,2]
##   setosa     1.462 0.1736640
##   versicolor 4.260 0.4699110
##   virginica  5.552 0.5518947
## 
##             Petal.Width
## Y             [,1]      [,2]
##   setosa     0.246 0.1053856
##   versicolor 1.326 0.1977527
##   virginica  2.026 0.2746501
table(predict(nb, iris[, -5]), iris[, 5])
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47
#-----------------------------------------------------------------------

# Determinar média e desvio-padrão amostral das preditoras para cada
# classe de iris.
meas <- c("mean", "sd")
res <- lapply(meas,
              FUN = function(m) {
                  a <- aggregate(as.matrix(iris[, 1:4]) ~ Species,
                                 data = iris,
                                 FUN = m)
                  a <- reshape2::melt(a, id.vars = "Species")
                  names(a)[ncol(a)] <- m
                  return(a)
              })
Reduce(merge, res)
##       Species     variable  mean        sd
## 1      setosa Petal.Length 1.462 0.1736640
## 2      setosa  Petal.Width 0.246 0.1053856
## 3      setosa Sepal.Length 5.006 0.3524897
## 4      setosa  Sepal.Width 3.428 0.3790644
## 5  versicolor Petal.Length 4.260 0.4699110
## 6  versicolor  Petal.Width 1.326 0.1977527
## 7  versicolor Sepal.Length 5.936 0.5161711
## 8  versicolor  Sepal.Width 2.770 0.3137983
## 9   virginica Petal.Length 5.552 0.5518947
## 10  virginica  Petal.Width 2.026 0.2746501
## 11  virginica Sepal.Length 6.588 0.6358796
## 12  virginica  Sepal.Width 2.974 0.3224966
f <- sprintf("~%s", paste(names(iris)[1:4], collapse = " + "))

densityplot(as.formula(f),
            outer = TRUE,
            data = iris,
            lty = 2,
            groups = Species,
            scales = "free") +
    glayer({
        mx <- mean(x)
        sdx <- sd(x)
        panel.mathdensity(dmath = dnorm,
                          n = 303,
                          col = col.line,
                          args = list(mean = mx,
                                      sd = sdx))
    })

4 Usando o pacote caret

TODO

25px