# Load the randomForest package (Breiman & Cutler's random forest algorithm)
# and the built-in iris data set, then preview the first rows.
# The "##" lines below reproduce the console output of this session.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Explicando a escolha das variáveis utilizadas.

# Fit a random forest classifying Species from all four measurements.
# importance=TRUE stores per-variable permutation importance;
# proximity=TRUE stores the N x N proximity matrix (how often two cases
# land in the same terminal node), used for the MDS embedding below.
library(randomForest)
data(iris)
set.seed(71)  # fix the RNG so the recorded output below is reproducible
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
proximity=TRUE)
print(iris.rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = iris, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.33%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         46         4        0.08
## virginica       0          4        46        0.08
## variable importance:
round(importance(iris.rf), 2)
##              setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length   6.04       7.85      7.93                11.51
## Sepal.Width    4.40       1.03      5.44                 5.40
## Petal.Length  21.76      31.33     29.64                32.94
## Petal.Width   22.84      32.67     31.68                34.50
##              MeanDecreaseGini
## Sepal.Length             8.77
## Sepal.Width              2.19
## Petal.Length            42.54
## Petal.Width             45.77
## Plot:
# Classical multidimensional scaling on 1 - proximity (a dissimilarity):
# cases that frequently share terminal nodes end up close in the 2-D map.
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
op <- par(pty="s")  # square plot region; previous settings saved in `op`
pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0,
col=c("red", "green", "blue")[as.numeric(iris$Species)],
main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")

par(op)  # restore the previous graphics settings

Etapa utilizando a base de treinamento:

# Refit using only the two most important predictors found above
# (Petal.Width and Petal.Length); prox=TRUE keeps the proximity matrix,
# needed by classCenter() further down.
forestIris <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris, 
    prox = TRUE)
forestIris
## 
## Call:
##  randomForest(formula = Species ~ Petal.Width + Petal.Length,      data = iris, prox = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 3.33%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          2        48        0.04

Recuperando uma das árvores geradas:

# Inspect the second tree of the forest. Each row is a node:
# status 1 = internal split node (split var / split point give the rule),
# status -1 = terminal node, whose `prediction` is a class index
# (presumably 1=setosa, 2=versicolor, 3=virginica — the factor-level
# order of Species).
getTree(forestIris, k = 2)
##   left daughter right daughter split var split point status prediction
## 1             2              3         1        1.55      1          0
## 2             4              5         2        2.45      1          0
## 3             0              0         0        0.00     -1          3
## 4             0              0         0        0.00     -1          1
## 5             6              7         2        4.95      1          0
## 6             0              0         0        0.00     -1          2
## 7             0              0         0        0.00     -1          3

Matriz de confusão:

# Confusion matrix of the out-of-bag predictions (predict() on a
# randomForest object with no newdata returns OOB predictions) against
# the true species.
table(predict(forestIris), iris$Species)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         2
##   virginica      0          3        48
# The same table expressed as proportions of all 150 cases.
prop.table(table(predict(forestIris), iris$Species))
##             
##                  setosa versicolor  virginica
##   setosa     0.33333333 0.00000000 0.00000000
##   versicolor 0.00000000 0.31333333 0.01333333
##   virginica  0.00000000 0.02000000 0.32000000

Identificação dos centros das classes:

# Locate each species' class center (prototype) in the Petal.Length /
# Petal.Width plane using the forest's proximity matrix, then plot all
# observations coloured by species with the three centers overlaid.
class_palette <- c("red", "blue", "green")
iris.p <- classCenter(iris[, c(3, 4)], iris$Species, forestIris$prox)
plot(
  iris[, 3], iris[, 4],
  pch = 21,
  xlab = names(iris)[3],
  ylab = names(iris)[4],
  bg = class_palette[as.numeric(factor(iris$Species))],
  main = "Dados sobre as plantas Iris com os centros das classes"
)
# Larger filled circles mark the class centers, one colour per class.
points(iris.p[, 1], iris.p[, 2], pch = 21, cex = 2, bg = class_palette)

Combinando alguns modelos:

# Grow three independent 50-tree forests on the same two predictors, then
# merge them with combine() into a single 150-tree model.
forestIris1 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris, 
    prox = TRUE, ntree = 50)
forestIris2 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris, 
    prox = TRUE, ntree = 50)
forestIris3 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris, 
    prox = TRUE, ntree = 50)
forestIris1
## 
## Call:
##  randomForest(formula = Species ~ Petal.Width + Petal.Length,      data = iris, prox = TRUE, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 4%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          3        47        0.06
forestIris2
## 
## Call:
##  randomForest(formula = Species ~ Petal.Width + Petal.Length,      data = iris, prox = TRUE, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 4%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          3        47        0.06
forestIris3
## 
## Call:
##  randomForest(formula = Species ~ Petal.Width + Petal.Length,      data = iris, prox = TRUE, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 4%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          3        47        0.06
# combine() concatenates the trees of the three forests into one ensemble.
# Note the combined object prints no OOB error / confusion section (see
# transcript below): OOB estimates are not recomputed across forests.
model <- combine(forestIris1, forestIris2, forestIris3)
model
## 
## Call:
##  randomForest(formula = Species ~ Petal.Width + Petal.Length,      data = iris, prox = TRUE, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 150
## No. of variables tried at each split: 1

Matriz de confusão para o novo modelo:

# Resubstitution confusion matrix: predictions on the training data
# itself (newdata = iris, so not out-of-bag) — hence the near-perfect
# fit recorded below (a single versicolor misclassified).
table(predict(model, iris), iris$Species)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         0
##   virginica      0          1        50

Predizer as classes para novos objetos:

# Simulate 1000 new observations by drawing each predictor from a normal
# distribution with the mean and sd of the corresponding iris column.
#
# BUG FIX: the original used `<-` instead of `=` inside data.frame().
# With `<-`, the vectors are assigned into the calling environment and the
# columns are named after the full deparsed expressions (e.g.
# "Sepal.Length....rnorm..."), so the data frame does NOT contain columns
# named Petal.Width / Petal.Length; predict() only found those variables
# by accident, via the vectors leaked into the global environment.
# Using `=` gives the columns their intended names.
newdata <- data.frame(
    Sepal.Length = rnorm(1000, mean(iris$Sepal.Length), sd(iris$Sepal.Length)),
    Sepal.Width  = rnorm(1000, mean(iris$Sepal.Width), sd(iris$Sepal.Width)),
    Petal.Width  = rnorm(1000, mean(iris$Petal.Width), sd(iris$Petal.Width)),
    Petal.Length = rnorm(1000, mean(iris$Petal.Length), sd(iris$Petal.Length)))

# Predicted species for each simulated observation.
pred <- predict(model, newdata)

Mostrando como o random forest é capaz de separar conjunto de dados que não são separáveis linearmente:

# Scatter the simulated observations in the petal plane, colouring each
# point by the class the combined forest predicted for it.
pred_colours <- c("red", "blue", "green")[as.numeric(pred)]
plot(
  newdata[, 4], newdata[, 3],
  pch = 21,
  xlab = "Petal.Length",
  ylab = "Petal.Width",
  bg = pred_colours,
  main = "Novos dados"
)

Sabatina: Replicar a aula sobre random forest utilizando as bases de dados Pima.tr e Pima.te, do pacote MASS. Usar 2 variáveis, mostrando o porquê da escolha, e achar a melhor classificação possível.