library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Explaining the choice of the variables used.
library(randomForest)
data(iris)
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris, importance = TRUE,
                        proximity = TRUE)
print(iris.rf)
##
## Call:
## randomForest(formula = Species ~ ., data = iris, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 5.33%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 46 4 0.08
## virginica 0 4 46 0.08
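As a quick check, the plot method for randomForest objects shows how the OOB error (and the per-class errors) stabilize as trees are added:
plot(iris.rf)  # error-rate curves versus number of trees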
Variable importance:
round(importance(iris.rf), 2)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 6.04 7.85 7.93 11.51
## Sepal.Width 4.40 1.03 5.44 5.40
## Petal.Length 21.76 31.33 29.64 32.94
## Petal.Width 22.84 32.67 31.68 34.50
## MeanDecreaseGini
## Sepal.Length 8.77
## Sepal.Width 2.19
## Petal.Length 42.54
## Petal.Width 45.77
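The same ranking can be visualized with varImpPlot(); both MeanDecreaseAccuracy and MeanDecreaseGini put Petal.Length and Petal.Width far above the sepal measurements, which is what motivates the two-variable model fitted below:
varImpPlot(iris.rf)  # dotcharts of the two importance measures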
Plotting the predictors alongside an MDS of the proximity matrix:
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
op <- par(pty="s")
pairs(cbind(iris[, 1:4], iris.mds$points), cex = 0.6, gap = 0,
      col = c("red", "green", "blue")[as.numeric(iris$Species)],
      main = "Iris Data: Predictors and MDS of Proximity Based on RandomForest")
par(op)
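As an aside, randomForest ships a convenience wrapper, MDSplot(), that performs the cmdscale(1 - proximity) step internally; a minimal equivalent view of the proximity scaling:
MDSplot(iris.rf, iris$Species, palette = c("red", "green", "blue"), pch = 20)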
Refitting on the training data, now using only the two most important variables:
forestIris <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
                           prox = TRUE)
forestIris
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 3.33%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 2 48 0.04
Retrieving one of the generated trees:
getTree(forestIris, k = 2)
## left daughter right daughter split var split point status prediction
## 1 2 3 1 1.55 1 0
## 2 4 5 2 2.45 1 0
## 3 0 0 0 0.00 -1 3
## 4 0 0 0 0.00 -1 1
## 5 6 7 2 4.95 1 0
## 6 0 0 0 0.00 -1 2
## 7 0 0 0 0.00 -1 3
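In this printout "split var" is only a column index and "prediction" a class number; passing labelVar = TRUE to getTree() prints the same tree with variable names and class labels, which is easier to read:
getTree(forestIris, k = 2, labelVar = TRUE)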
Confusion matrix:
table(predict(forestIris), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 2
## virginica 0 3 48
prop.table(table(predict(forestIris), iris$Species))
##
## setosa versicolor virginica
## setosa 0.33333333 0.00000000 0.00000000
## versicolor 0.00000000 0.31333333 0.01333333
## virginica 0.00000000 0.02000000 0.32000000
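The overall accuracy is the sum of the diagonal of the proportion table, or equivalently the fraction of OOB predictions that match the true species:
# both lines give the same number
sum(diag(prop.table(table(predict(forestIris), iris$Species))))
mean(predict(forestIris) == iris$Species)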
Identifying the class centers:
iris.p <- classCenter(iris[, c(3, 4)], iris$Species, forestIris$prox)
plot(iris[, 3], iris[, 4], pch = 21, xlab = names(iris)[3], ylab = names(iris)[4],
     bg = c("red", "blue", "green")[as.numeric(factor(iris$Species))], main = "Iris Data with the Class Centers")
points(iris.p[, 1], iris.p[, 2], pch = 21, cex = 2, bg = c("red", "blue", "green"))
Combining several models:
forestIris1 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
                            prox = TRUE, ntree = 50)
forestIris2 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
                            prox = TRUE, ntree = 50)
forestIris3 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
                            prox = TRUE, ntree = 50)
forestIris1
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE, ntree = 50)
## Type of random forest: classification
## Number of trees: 50
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 3 47 0.06
forestIris2
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE, ntree = 50)
## Type of random forest: classification
## Number of trees: 50
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 3 47 0.06
forestIris3
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE, ntree = 50)
## Type of random forest: classification
## Number of trees: 50
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 3 47 0.06
model <- combine(forestIris1, forestIris2, forestIris3)
model
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE, ntree = 50)
## Type of random forest: classification
## Number of trees: 150
## No. of variables tried at each split: 1
Confusion matrix for the combined model:
table(predict(model, iris), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 0
## virginica 0 1 50
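Two caveats: combine() does not recompute the OOB error (which is why the printout of model has no OOB line), and predict(model, iris) scores the training rows, so this matrix reflects resubstitution performance and is optimistic. The corresponding accuracy:
mean(predict(model, iris) == iris$Species)  # resubstitution accuracy, optimistic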
Predicting the classes for new objects:
# use "=" (not "<-") inside data.frame() so the columns keep the names predict() expects
newdata <- data.frame(Sepal.Length = rnorm(1000, mean(iris$Sepal.Length), sd(iris$Sepal.Length)),
                      Sepal.Width = rnorm(1000, mean(iris$Sepal.Width), sd(iris$Sepal.Width)),
                      Petal.Width = rnorm(1000, mean(iris$Petal.Width), sd(iris$Petal.Width)),
                      Petal.Length = rnorm(1000, mean(iris$Petal.Length), sd(iris$Petal.Length)))
pred <- predict(model, newdata)
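A quick look at how the simulated points were assigned across the three classes:
table(pred)  # predicted class frequencies for the 1000 simulated rows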
Showing how the random forest can separate data sets that are not linearly separable:
plot(newdata[, 4], newdata[, 3], pch = 21, xlab = "Petal.Length", ylab = "Petal.Width",
     bg = c("red", "blue", "green")[as.numeric(pred)], main = "New data")
Assignment: replicate this random forest lesson using the Pima.tr and Pima.te data sets from the MASS package. Use two variables, explain why you chose them, and find the best classification you can.
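A minimal starting sketch for the assignment, fitting on Pima.tr and evaluating on Pima.te; the choice of glu and bmi is an assumption here and should be confirmed or corrected from the importance output:
library(MASS)          # provides Pima.tr (training set) and Pima.te (test set)
library(randomForest)
set.seed(71)
# fit on all predictors first to rank them
pima.rf <- randomForest(type ~ ., data = Pima.tr, importance = TRUE)
round(importance(pima.rf), 2)
# two-variable model; glu and bmi are assumed, adjust after inspecting the importances
pima.rf2 <- randomForest(type ~ glu + bmi, data = Pima.tr)
table(predict(pima.rf2, Pima.te), Pima.te$type)   # confusion matrix on the test set
mean(predict(pima.rf2, Pima.te) == Pima.te$type)  # test-set accuracy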