1 Métodos para árvores de regressão

1.1 Carregar os dados

rm(list = objects())
library(lattice)
library(latticeExtra)
#-----------------------------------------------------------------------
# Produção de teca.

# Endereço das tabelas.
pre <- "https://raw.githubusercontent.com/walmes/EACS/master/data-raw/"
files <- c(hidric = "teca_crapar.csv",
           quimic = "teca_qui.csv",
           prod = "teca_arv.csv")
urls <- paste0(pre, files)
names(urls) <- names(files)

# Lista com as tabelas.
da <- sapply(urls,
             FUN = read.table,
             header = TRUE,
             sep = ";",
             simplify = FALSE)
str(da)
## List of 3
##  $ hidric:'data.frame':  141 obs. of  10 variables:
##   ..$ loc: int [1:141] 1 1 1 2 2 2 3 3 3 4 ...
##   ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 3 2 1 1 3 2 1 3 2 3 ...
##   ..$ Ur : num [1:141] 0.178 0.188 0.223 0.131 0.165 ...
##   ..$ Us : num [1:141] 0.58 0.647 0.648 0.697 0.573 ...
##   ..$ alp: num [1:141] -0.833 -0.722 -0.724 -0.548 -0.547 ...
##   ..$ n  : num [1:141] 3.11 3.25 3.59 4.01 2.92 ...
##   ..$ I  : num [1:141] 0.957 0.835 0.815 0.62 0.691 ...
##   ..$ Ui : num [1:141] 0.396 0.435 0.45 0.431 0.387 ...
##   ..$ S  : num [1:141] -0.273 -0.329 -0.341 -0.515 -0.257 ...
##   ..$ cad: num [1:141] 0.217 0.247 0.227 0.3 0.222 ...
##  $ quimic:'data.frame':  150 obs. of  15 variables:
##   ..$ loc: int [1:150] 1 1 1 2 2 2 3 3 3 4 ...
##   ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 1 3 2 1 3 2 1 3 2 1 ...
##   ..$ ph : num [1:150] 6.8 6.7 6.7 4.7 4.7 4.9 7.6 6.8 6.9 6.6 ...
##   ..$ p  : num [1:150] 22.51 0.83 0.01 3.89 0.69 ...
##   ..$ k  : num [1:150] 72.24 13.42 7.23 48.13 12.34 ...
##   ..$ ca : num [1:150] 8.27 2.91 2.33 0.97 0.76 0.21 7.63 3.22 2.22 5.54 ...
##   ..$ mg : num [1:150] 1.7 1.77 0.51 0.16 0.14 0 1.53 1.31 1.09 1.77 ...
##   ..$ al : num [1:150] 0 0 0 0.3 0.6 0.6 0 0 0 0 ...
##   ..$ ctc: num [1:150] 12.47 6.57 4.52 5.3 4.17 ...
##   ..$ sat: num [1:150] 81.4 71.7 63.2 23.7 22.4 ...
##   ..$ mo : num [1:150] 72.2 25.6 9.7 34.4 8.7 9.7 50.4 15.6 9.5 50.2 ...
##   ..$ arg: num [1:150] 184 215 286 232 213 ...
##   ..$ are: num [1:150] 770 750 674 741 775 ...
##   ..$ cas: num [1:150] 1.8 2.2 3.7 0.4 1.1 1.7 8.4 19 14.3 5.5 ...
##   ..$ acc: num [1:150] 770 750 676 741 775 ...
##  $ prod  :'data.frame':  50 obs. of  5 variables:
##   ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ alt : num [1:50] 12.9 11.9 22.5 20.4 15.6 ...
##   ..$ dap : num [1:50] 0.194 0.182 0.348 0.304 0.227 ...
##   ..$ vol : num [1:50] 0.2707 0.0963 0.4974 0.4074 0.2165 ...
##   ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Manipular as tabelas para fazer o merge.
da$quimic <- subset(da$quimic, cam == "[0, 5)", select = -cam)
da$hidric <- subset(da$hidric, cam == "[0, 5)", select = -c(cam, cad))
da$prod <- subset(da$prod, select = c(loc, prod))
str(da)
## List of 3
##  $ hidric:'data.frame':  48 obs. of  8 variables:
##   ..$ loc: int [1:48] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ Ur : num [1:48] 0.223 0.131 0.225 0.233 0.172 ...
##   ..$ Us : num [1:48] 0.648 0.697 0.614 0.647 0.844 ...
##   ..$ alp: num [1:48] -0.724 -0.548 -0.707 -0.555 -0.25 ...
##   ..$ n  : num [1:48] 3.59 4.01 2.53 2.91 2.1 ...
##   ..$ I  : num [1:48] 0.815 0.62 0.905 0.7 0.557 ...
##   ..$ Ui : num [1:48] 0.45 0.431 0.44 0.459 0.556 ...
##   ..$ S  : num [1:48] -0.341 -0.515 -0.206 -0.26 -0.278 ...
##  $ quimic:'data.frame':  50 obs. of  14 variables:
##   ..$ loc: int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ ph : num [1:50] 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
##   ..$ p  : num [1:50] 22.51 3.89 11.52 26.43 3.79 ...
##   ..$ k  : num [1:50] 72.2 48.1 114.5 54.3 56.8 ...
##   ..$ ca : num [1:50] 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
##   ..$ mg : num [1:50] 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
##   ..$ al : num [1:50] 0 0.3 0 0 0 0 0 0 0 0.1 ...
##   ..$ ctc: num [1:50] 12.47 5.3 12.41 9.11 7.57 ...
##   ..$ sat: num [1:50] 81.4 23.7 76.1 81.8 69.4 ...
##   ..$ mo : num [1:50] 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
##   ..$ arg: num [1:50] 184 232 193 142 236 ...
##   ..$ are: num [1:50] 770 741 716 790 733 ...
##   ..$ cas: num [1:50] 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
##   ..$ acc: num [1:50] 770 741 718 791 734 ...
##  $ prod  :'data.frame':  50 obs. of  2 variables:
##   ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Aplica o merge recursivamente.
teca <- Reduce(f = merge, x = da)

# Elimina a variável identificadora (agora desnecessária).
teca$loc <- NULL

# Estrutura da tabela.
str(teca)
## 'data.frame':    48 obs. of  21 variables:
##  $ Ur  : num  0.223 0.131 0.225 0.233 0.172 ...
##  $ Us  : num  0.648 0.697 0.614 0.647 0.844 ...
##  $ alp : num  -0.724 -0.548 -0.707 -0.555 -0.25 ...
##  $ n   : num  3.59 4.01 2.53 2.91 2.1 ...
##  $ I   : num  0.815 0.62 0.905 0.7 0.557 ...
##  $ Ui  : num  0.45 0.431 0.44 0.459 0.556 ...
##  $ S   : num  -0.341 -0.515 -0.206 -0.26 -0.278 ...
##  $ ph  : num  6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
##  $ p   : num  22.51 3.89 11.52 26.43 3.79 ...
##  $ k   : num  72.2 48.1 114.5 54.3 56.8 ...
##  $ ca  : num  8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
##  $ mg  : num  1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
##  $ al  : num  0 0.3 0 0 0 0 0 0 0 0.1 ...
##  $ ctc : num  12.47 5.3 12.41 9.11 7.57 ...
##  $ sat : num  81.4 23.7 76.1 81.8 69.4 ...
##  $ mo  : num  72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
##  $ arg : num  184 232 193 142 236 ...
##  $ are : num  770 741 716 790 733 ...
##  $ cas : num  1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
##  $ acc : num  770 741 718 791 734 ...
##  $ prod: num  68.2 24.3 125.3 102.7 54.6 ...
#-----------------------------------------------------------------------
# Preço de imóveis para 7 bairros em Curitiba.

u <- "http://www.leg.ufpr.br/~walmes/data/ap_venda7bairros_cwb_210314.txt"
ap <- read.table(file = u, header = TRUE, sep = "\t")
# str(ap)

# Usar o log do preço e da metragem.
ap <- transform(ap,
                larea = log10(area),
                lpreco = log10(preco),
                preco = NULL,
                area = NULL)

# Exclui outliers.
# plot(lpreco ~ larea, data = ap)
# dput(with(ap, identify(larea, lpreco)))

ap <- ap[-c(1966L, 2696L, 3267L), ]
ap <- subset(ap, lpreco > 4)
rownames(ap) <- NULL
str(ap)
## 'data.frame':    4467 obs. of  7 variables:
##  $ quartos  : int  2 3 2 2 3 3 3 3 3 3 ...
##  $ banheiros: int  2 2 2 2 2 2 2 5 1 1 ...
##  $ vagas    : int  1 1 2 1 1 1 1 2 1 1 ...
##  $ suites   : int  1 1 1 1 1 1 1 3 1 1 ...
##  $ bairro   : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ larea    : num  1.8 1.88 1.8 1.8 1.88 ...
##  $ lpreco   : num  5.46 5.52 5.57 5.47 5.41 ...
#-----------------------------------------------------------------------
# Análise exploratória.

xyplot(lpreco ~ larea | cut(vagas, c(0:3, Inf)),
       data = ap,
       as.table = TRUE)

xyplot(lpreco ~ larea | bairro,
       data = ap,
       as.table = TRUE) +
    layer(panel.smoother(...))

1.2 Árvore de regressão

#-----------------------------------------------------------------------
# Ajuste de árvore de regressão para os dados de teca.

library(rpart)
library(rpart.plot)
# help(rpart, help_type = "html")

# Ajuste do modelo.
m0 <- rpart(prod ~ ., data = teca)

# Resumo do ajuste.
summary(m0)
## Call:
## rpart(formula = prod ~ ., data = teca)
##   n= 48 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.62742947      0 1.0000000 1.0876071 0.18680894
## 2 0.07594851      1 0.3725705 0.3962829 0.08804663
## 3 0.01000000      2 0.2966220 0.4585990 0.10622713
## 
## Variable importance
##  ca  ph sat ctc are  mo   I   k acc   n 
##  21  18  18  18  11  10   1   1   1   1 
## 
## Node number 1: 48 observations,    complexity param=0.6274295
##   mean=82.81385, MSE=1317.39 
##   left son=2 (19 obs) right son=3 (29 obs)
##   Primary splits:
##       ca  < 4.14      to the left,  improve=0.6274295, (0 missing)
##       ctc < 8.97      to the left,  improve=0.5302188, (0 missing)
##       sat < 72.875    to the left,  improve=0.5157282, (0 missing)
##       ph  < 6.25      to the left,  improve=0.4864565, (0 missing)
##       k   < 85.805    to the left,  improve=0.3785542, (0 missing)
##   Surrogate splits:
##       ctc < 8.97      to the left,  agree=0.938, adj=0.842, (0 split)
##       sat < 72.875    to the left,  agree=0.917, adj=0.789, (0 split)
##       ph  < 6.25      to the left,  agree=0.896, adj=0.737, (0 split)
##       are < 631.25    to the right, agree=0.812, adj=0.526, (0 split)
##       mo  < 37.05     to the left,  agree=0.792, adj=0.474, (0 split)
## 
## Node number 2: 19 observations
##   mean=47.29476, MSE=442.1655 
## 
## Node number 3: 29 observations,    complexity param=0.07594851
##   mean=106.085, MSE=522.6983 
##   left son=6 (18 obs) right son=7 (11 obs)
##   Primary splits:
##       ph  < 6.95      to the left,  improve=0.3168297, (0 missing)
##       cas < 41.4      to the left,  improve=0.3025569, (0 missing)
##       ca  < 8.58      to the left,  improve=0.2635895, (0 missing)
##       ctc < 12.715    to the left,  improve=0.2252505, (0 missing)
##       k   < 101.29    to the left,  improve=0.1968285, (0 missing)
##   Surrogate splits:
##       I   < 0.6788505 to the right, agree=0.793, adj=0.455, (0 split)
##       sat < 86.18     to the left,  agree=0.793, adj=0.455, (0 split)
##       k   < 96.6      to the left,  agree=0.759, adj=0.364, (0 split)
##       n   < 3.908742  to the left,  agree=0.724, adj=0.273, (0 split)
##       acc < 632.105   to the left,  agree=0.724, adj=0.273, (0 split)
## 
## Node number 6: 18 observations
##   mean=96.02497, MSE=160.483 
## 
## Node number 7: 11 observations
##   mean=122.5468, MSE=678.8158
# Visualização da árvore de regressão.
rpart.plot(m0)

# Valores preditos.
predict(m0)
##         1         2         3         4         5         6         7 
##  96.02497  47.29476 122.54682  96.02497  47.29476  47.29476  47.29476 
##         8         9        10        11        12        13        14 
##  96.02497  47.29476  47.29476 122.54682 122.54682  96.02497  47.29476 
##        15        16        17        18        19        20        21 
##  47.29476  47.29476 122.54682  96.02497 122.54682  47.29476  96.02497 
##        22        23        24        25        26        27        28 
##  47.29476 122.54682  47.29476  96.02497 122.54682 122.54682 122.54682 
##        29        30        31        32        33        34        35 
##  96.02497  47.29476  96.02497 122.54682  47.29476  47.29476  47.29476 
##        36        37        38        39        40        41        42 
##  96.02497  47.29476  47.29476  96.02497  96.02497  96.02497  96.02497 
##        43        44        45        46        47        48 
##  96.02497  47.29476 122.54682  96.02497  96.02497  96.02497
# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
## [1]  47.29476  96.02497 122.54682
table(sort(predict(m0)))
## 
## 47.2947619118874 96.0249682912905 122.546820129111 
##               19               18               11
#-----------------------------------------------------------------------
# Deixar a árvore crescer mais.

m1 <- rpart(prod ~ .,
            data = teca,
            control = list(minsplit = 5,
                           cp = 0.001))

rpart.plot(m1)

# Valores preditos (médias em cada região).
unique(sort(predict(m1)))
## [1]  31.66904  58.65893  90.26256 101.78738 122.54682
#-----------------------------------------------------------------------
# Ajuste para os dados de imóveis.

m0 <- rpart(lpreco ~ ., data = ap)

# Resumo do ajuste.
summary(m0)
## Call:
## rpart(formula = lpreco ~ ., data = ap)
##   n= 4467 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.55273200      0 1.0000000 1.0006471 0.022309188
## 2 0.10528202      1 0.4472680 0.4543668 0.010557382
## 3 0.08675949      2 0.3419860 0.3451494 0.006988209
## 4 0.02481863      3 0.2552265 0.2598557 0.005719558
## 5 0.02041587      4 0.2304079 0.2285848 0.005312121
## 6 0.01628417      5 0.2099920 0.2126618 0.005200752
## 7 0.01526750      6 0.1937078 0.1943037 0.004587582
## 8 0.01073530      7 0.1784403 0.1809174 0.004398348
## 9 0.01000000      8 0.1677050 0.1755710 0.004332548
## 
## Variable importance
##   larea quartos   vagas  bairro  suites 
##      54      21      14      10       1 
## 
## Node number 1: 4467 observations,    complexity param=0.552732
##   mean=5.755241, MSE=0.09402792 
##   left son=2 (2831 obs) right son=3 (1636 obs)
##   Primary splits:
##       larea     < 2.093071 to the left,  improve=0.5527320, (0 missing)
##       vagas     < 2.5      to the left,  improve=0.4074442, (532 missing)
##       quartos   < 3.5      to the left,  improve=0.3332465, (48 missing)
##       suites    < 1.5      to the left,  improve=0.3233594, (1159 missing)
##       banheiros < 2.5      to the left,  improve=0.2814297, (1029 missing)
##   Surrogate splits:
##       quartos < 3.5      to the left,  agree=0.793, adj=0.435, (0 split)
##       bairro  splits as  LRLLLRL,      agree=0.710, adj=0.208, (0 split)
##       vagas   < 2.5      to the left,  agree=0.694, adj=0.163, (0 split)
## 
## Node number 2: 2831 observations,    complexity param=0.08675949
##   mean=5.581937, MSE=0.03292292 
##   left son=4 (1497 obs) right son=5 (1334 obs)
##   Primary splits:
##       larea     < 1.876247 to the left,  improve=0.3909773, (0 missing)
##       vagas     < 1.5      to the left,  improve=0.2576053, (464 missing)
##       quartos   < 1.5      to the left,  improve=0.1730252, (25 missing)
##       banheiros < 1.5      to the left,  improve=0.1709035, (643 missing)
##       bairro    splits as  RRRLLRL,      improve=0.1645266, (0 missing)
##   Surrogate splits:
##       quartos   < 2.5      to the left,  agree=0.786, adj=0.546, (0 split)
##       bairro    splits as  RRRLLRL,      agree=0.636, adj=0.228, (0 split)
##       vagas     < 1.5      to the left,  agree=0.612, adj=0.176, (0 split)
##       banheiros < 1.5      to the left,  agree=0.539, adj=0.021, (0 split)
## 
## Node number 3: 1636 observations,    complexity param=0.105282
##   mean=6.055132, MSE=0.05785938 
##   left son=6 (1154 obs) right son=7 (482 obs)
##   Primary splits:
##       larea     < 2.358287 to the left,  improve=0.4671646, (0 missing)
##       vagas     < 2.5      to the left,  improve=0.4056675, (68 missing)
##       suites    < 3.5      to the left,  improve=0.3204377, (172 missing)
##       banheiros < 5.5      to the left,  improve=0.1420578, (386 missing)
##       quartos   < 3.5      to the left,  improve=0.1340611, (23 missing)
##   Surrogate splits:
##       vagas   < 3.5      to the left,  agree=0.796, adj=0.309, (0 split)
##       suites  < 3.5      to the left,  agree=0.718, adj=0.041, (0 split)
##       quartos < 4.5      to the left,  agree=0.711, adj=0.019, (0 split)
## 
## Node number 4: 1497 observations,    complexity param=0.02041587
##   mean=5.474836, MSE=0.02047765 
##   left son=8 (565 obs) right son=9 (932 obs)
##   Primary splits:
##       larea     < 1.73882  to the left,  improve=0.27972990, (0 missing)
##       quartos   < 1.5      to the left,  improve=0.11830160, (18 missing)
##       banheiros < 1.5      to the left,  improve=0.11754160, (384 missing)
##       bairro    splits as  RRRLLRL,      improve=0.09454728, (0 missing)
##       vagas     < 1.5      to the left,  improve=0.04037166, (375 missing)
##   Surrogate splits:
##       quartos < 1.5      to the left,  agree=0.868, adj=0.651, (0 split)
##       bairro  splits as  RRRLRRR,      agree=0.721, adj=0.262, (0 split)
## 
## Node number 5: 1334 observations,    complexity param=0.0152675
##   mean=5.702124, MSE=0.01957181 
##   left son=10 (527 obs) right son=11 (807 obs)
##   Primary splits:
##       vagas     < 1.5      to the left,  improve=0.27545610, (89 missing)
##       larea     < 1.963929 to the left,  improve=0.18560880, (0 missing)
##       bairro    splits as  RRRLLRL,      improve=0.10414990, (0 missing)
##       banheiros < 2.5      to the left,  improve=0.08565034, (259 missing)
##       suites    < 1.5      to the left,  improve=0.04342607, (218 missing)
##   Surrogate splits:
##       larea  < 1.906657 to the left,  agree=0.645, adj=0.14, (89 split)
##       bairro splits as  RRRLLRR,      agree=0.612, adj=0.06, (0 split)
## 
## Node number 6: 1154 observations,    complexity param=0.02481863
##   mean=5.948879, MSE=0.02474197 
##   left son=12 (777 obs) right son=13 (377 obs)
##   Primary splits:
##       vagas     < 2.5      to the left,  improve=0.33614730, (54 missing)
##       larea     < 2.211788 to the left,  improve=0.28684050, (0 missing)
##       suites    < 1.5      to the left,  improve=0.25779510, (128 missing)
##       bairro    splits as  RRRLLRL,      improve=0.11106200, (0 missing)
##       banheiros < 3.5      to the left,  improve=0.08901843, (253 missing)
##   Surrogate splits:
##       larea  < 2.239298 to the left,  agree=0.73, adj=0.175, (54 split)
##       bairro splits as  LLLLLRL,      agree=0.68, adj=0.022, (0 split)
## 
## Node number 7: 482 observations,    complexity param=0.01628417
##   mean=6.309523, MSE=0.04540429 
##   left son=14 (238 obs) right son=15 (244 obs)
##   Primary splits:
##       vagas     < 3.5      to the left,  improve=0.31065010, (14 missing)
##       suites    < 3.5      to the left,  improve=0.30449720, (44 missing)
##       larea     < 2.54512  to the left,  improve=0.25779660, (0 missing)
##       bairro    splits as  LRRLLRL,      improve=0.08727943, (0 missing)
##       banheiros < 5.5      to the left,  improve=0.08022878, (133 missing)
##   Surrogate splits:
##       larea   < 2.529554 to the left,  agree=0.686, adj=0.352, (14 split)
##       suites  < 3.5      to the left,  agree=0.667, adj=0.313, (0 split)
##       bairro  splits as  LLRLLRL,      agree=0.656, adj=0.291, (0 split)
##       quartos < 3.5      to the left,  agree=0.588, adj=0.150, (0 split)
## 
## Node number 8: 565 observations
##   mean=5.37763, MSE=0.01696386 
## 
## Node number 9: 932 observations
##   mean=5.533765, MSE=0.013407 
## 
## Node number 10: 527 observations
##   mean=5.616327, MSE=0.01477494 
## 
## Node number 11: 807 observations
##   mean=5.758153, MSE=0.01475799 
## 
## Node number 12: 777 observations,    complexity param=0.0107353
##   mean=5.882675, MSE=0.01808852 
##   left son=24 (126 obs) right son=25 (651 obs)
##   Primary splits:
##       vagas     < 1.5      to the left,  improve=0.23951310, (37 missing)
##       larea     < 2.209783 to the left,  improve=0.14406520, (0 missing)
##       suites    < 1.5      to the left,  improve=0.11933820, (90 missing)
##       bairro    splits as  RRRLRRR,      improve=0.11233650, (0 missing)
##       banheiros < 3.5      to the left,  improve=0.05954215, (167 missing)
##   Surrogate splits:
##       bairro splits as  RRRLRRR, agree=0.865, adj=0.099, (37 split)
## 
## Node number 13: 377 observations
##   mean=6.085325, MSE=0.0108039 
## 
## Node number 14: 238 observations
##   mean=6.188907, MSE=0.03177745 
## 
## Node number 15: 244 observations
##   mean=6.427172, MSE=0.03066439 
## 
## Node number 24: 126 observations
##   mean=5.709519, MSE=0.01694193 
## 
## Node number 25: 651 observations
##   mean=5.916189, MSE=0.01138406
# Visualização da árvore de regressão.
rpart.plot(m0)

# Visualização alternativa.
plot(m0)
text(m0)

# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
## [1] 5.377630 5.533765 5.616327 5.709519 5.758153 5.916189 6.085325 6.188907
## [9] 6.427172
# Importância das variáveis.
cbind(m0$variable.importance)
##                  [,1]
## larea     326.4799195
## quartos   128.2119647
## vagas      84.9330547
## bairro     61.7563236
## suites      3.9613041
## banheiros   0.7648777
# Soma de quadrados residual do modelo nulo.
m0$frame$dev[1]                      # SQres do ~1.
## [1] 420.0227
(m0$frame$n[1] - 1) * var(ap$lpreco) # SQres do ~1.
## [1] 420.0227
# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])
## [1] 0.832295
# Criando um grid nas variáveis consideradas pela árvore.
grid <- with(ap,
             expand.grid(larea = seq(min(larea, na.rm = TRUE),
                                     max(larea, na.rm = TRUE),
                                     length.out = 50),
                         vagas = seq(min(vagas, na.rm = TRUE),
                                     max(vagas, na.rm = TRUE),
                                     length.out = 50),
                         bairro = levels(bairro)[1],
                         quartos = median(quartos, na.rm = TRUE),
                         banheiros = median(banheiros, na.rm = TRUE),
                         suites = median(suites, na.rm = TRUE),
                         KEEP.OUT.ATTRS = FALSE))
str(grid)
## 'data.frame':    2500 obs. of  6 variables:
##  $ larea    : num  1.24 1.27 1.31 1.34 1.38 ...
##  $ vagas    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ bairro   : Factor w/ 1 level "agua-verde": 1 1 1 1 1 1 1 1 1 1 ...
##  $ quartos  : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ banheiros: num  2 2 2 2 2 2 2 2 2 2 ...
##  $ suites   : num  1 1 1 1 1 1 1 1 1 1 ...
# Predição para os valores no grid.
grid$y <- predict(m0, newdata = grid)

# Um fator para indicar as diferentes regiões.
yp <- predict(m0, newdata = ap)
yp <- rank(yp, ties.method = "min")
yp <- as.integer(factor(yp))

# Visualização das regiões criadas pelos cortes perpendiculares aos
# eixos.
levelplot(y ~ larea + vagas, data = grid, contour = TRUE) +
    layer(panel.points(larea, vagas, col = yp), data = ap)

# Diagrama de dispersão 3D.
cloud(lpreco ~ larea + vagas, data = ap, col = yp)

# Os patamares.
wireframe(y ~ larea + vagas, data = grid, drape = TRUE)

# Deixar a árvore crescer mais.
m0 <- rpart(lpreco ~ .,
            data = ap,
            control = list(cp = 0.0025))

# Visualização da árvore de regressão.
rpart.plot(m0)

# Visualização alternativa.
plot(m0)
text(m0)

# Valores preditos (médias em cada região).
unique(sort(predict(m0)))
##  [1] 5.302384 5.417912 5.455249 5.563499 5.599947 5.681727 5.709519
##  [8] 5.743817 5.789740 5.886212 5.974917 6.085325 6.124560 6.280833
## [15] 6.344578 6.515365
# Importância das variáveis.
cbind(m0$variable.importance)
##                  [,1]
## larea     333.3563170
## quartos   128.7666179
## vagas      85.4301102
## bairro     65.8767544
## suites      6.2421718
## banheiros   0.7648777
# R².
1 - sum(residuals(m0)^2)/(m0$frame$dev[1])
## [1] 0.8591228

1.3 Árvores de regressão com bagging

# Sempre fazer a predição com esses inputs.
pred <- subset(ap, select = -lpreco)
names(pred)
## [1] "quartos"   "banheiros" "vagas"     "suites"    "bairro"    "larea"
# ID de cada registro.
n <- nrow(ap)
s <- 1:n

# Índices para amostragem com reposição (bootstrap).
i <- sample(s, size = n, replace = TRUE)

# Qual a proporção de registros únicos tomados?
u <- unique(i)
length(u)/n
## [1] 0.6330871
# Quais as observações que ficaram de fora?
out <- which(!(s %in% u))
head(out)
## [1]  3  4  5  6  7 16
tail(out)
## [1] 4446 4447 4456 4458 4463 4465
# Amostra boostrap da tabela.
ap_bs <- ap[i, ]

# Ajuste do modelo.
m_bs <- rpart(lpreco ~ ., data = ap_bs)

# Valores preditos.
y_bs <- predict(m_bs, newdata = pred)
head(y_bs)
##        1        2        3        4        5        6 
## 5.533305 5.533305 5.533305 5.533305 5.533305 5.533305
# Out of bag mean square error.
sum((ap[out, ]$lpreco -
     predict(m_bs, newdata = pred[out, ]))^2)/length(out)
## [1] 0.01614876
#-----------------------------------------------------------------------
# Repetir B vezes.

set.seed(102030)
B <- 200
j <- 1
frac <- numeric(B)
fits <- replicate(B,
                  simplify = FALSE,
                  expr = {
                      # Reamostra com reposição.
                      i <- sample(s, size = n, replace = TRUE)
                      frac[j] <<- length(unique(i))/n
                      j <<- j + 1
                      ap_bs <- ap[i, ]
                      # Ajuste da árvore aos dados de treino.
                      m_bs <- rpart(lpreco ~ ., data = ap_bs)
                      return(m_bs)
                  })

# A proporção de valores usados nas amostras bootstrap.
mean(frac)
## [1] 0.6322084
# Os preditos em cada "ensacamento" dos dados.
pred$y <- sapply(fits, FUN = predict, newdata = pred)
str(pred)
## 'data.frame':    4467 obs. of  7 variables:
##  $ quartos  : int  2 3 2 2 3 3 3 3 3 3 ...
##  $ banheiros: int  2 2 2 2 2 2 2 5 1 1 ...
##  $ vagas    : int  1 1 2 1 1 1 1 2 1 1 ...
##  $ suites   : int  1 1 1 1 1 1 1 3 1 1 ...
##  $ bairro   : Factor w/ 7 levels "agua-verde","batel",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ larea    : num  1.8 1.88 1.8 1.8 1.88 ...
##  $ y        : num [1:4467, 1:200] 5.51 5.6 5.51 5.51 5.6 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr  "1" "2" "3" "4" ...
##   .. ..$ : NULL
# Predição para os primeiros casos.
head(pred)
##   quartos banheiros vagas suites bairro    larea      y.1      y.2
## 1       2         2     1      1 portao 1.799341 5.511545 5.482001
## 2       3         2     1      1 portao 1.875061 5.603164 5.603398
## 3       2         2     2      1 portao 1.797752 5.511545 5.482001
## 4       2         2     1      1 portao 1.798513 5.511545 5.482001
## 5       3         2     1      1 portao 1.875061 5.603164 5.603398
## 6       3         2     1      1 portao 1.875061 5.603164 5.603398
##        y.3      y.4      y.5      y.6      y.7      y.8      y.9     y.10
## 1 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 2 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 3 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 4 5.517266 5.525659 5.417492 5.483622 5.516842 5.421217 5.539943 5.476510
## 5 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
## 6 5.517266 5.601769 5.592200 5.596915 5.516842 5.580467 5.539943 5.588948
##       y.11     y.12     y.13     y.14     y.15     y.16     y.17     y.18
## 1 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 2 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 3 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 4 5.530034 5.506683 5.424198 5.517418 5.487971 5.514049 5.525802 5.417721
## 5 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
## 6 5.614376 5.596937 5.589198 5.517418 5.583679 5.514049 5.525802 5.582104
##       y.19     y.20     y.21     y.22     y.23     y.24     y.25     y.26
## 1 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 2 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 3 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 4 5.545297 5.423683 5.523571 5.423676 5.418806 5.534867 5.519412 5.531598
## 5 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
## 6 5.545297 5.583326 5.611178 5.589373 5.588978 5.534867 5.609472 5.531598
##       y.27     y.28     y.29     y.30     y.31     y.32     y.33     y.34
## 1 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 2 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 3 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 4 5.551241 5.523415 5.491542 5.544313 5.553416 5.481691 5.417659 5.541571
## 5 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
## 6 5.551241 5.598014 5.581981 5.544313 5.553416 5.580614 5.576820 5.541571
##       y.35     y.36     y.37     y.38     y.39     y.40     y.41     y.42
## 1 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 2 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 3 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 4 5.414450 5.486812 5.529638 5.489836 5.409920 5.415181 5.544689 5.490350
## 5 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
## 6 5.584425 5.586810 5.529638 5.592012 5.587857 5.583763 5.544689 5.595722
##       y.43     y.44     y.45     y.46     y.47     y.48     y.49     y.50
## 1 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 2 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 3 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 4 5.524937 5.549808 5.483643 5.416401 5.520968 5.490118 5.508316 5.491341
## 5 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
## 6 5.524937 5.549808 5.589977 5.590423 5.520968 5.592157 5.619047 5.591869
##       y.51     y.52     y.53     y.54     y.55     y.56     y.57     y.58
## 1 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 2 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 3 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 4 5.425375 5.533985 5.417148 5.547536 5.535831 5.477805 5.527240 5.536265
## 5 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
## 6 5.586143 5.533985 5.589006 5.547536 5.599650 5.591776 5.618650 5.536265
##       y.59     y.60     y.61     y.62     y.63     y.64     y.65     y.66
## 1 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 2 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 3 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 4 5.502107 5.412507 5.414077 5.530113 5.470907 5.415142 5.464601 5.531353
## 5 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
## 6 5.594393 5.581721 5.575740 5.611105 5.587769 5.590499 5.590026 5.613648
##       y.67     y.68     y.69     y.70     y.71     y.72     y.73     y.74
## 1 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 2 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 3 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 4 5.504601 5.535521 5.463580 5.467870 5.427182 5.417166 5.546890 5.542992
## 5 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
## 6 5.595779 5.535521 5.589690 5.580569 5.583744 5.580957 5.546890 5.542992
##       y.75     y.76     y.77     y.78     y.79     y.80     y.81     y.82
## 1 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 2 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 3 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 4 5.421072 5.536914 5.412019 5.544779 5.537239 5.522028 5.418187 5.522203
## 5 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
## 6 5.587025 5.536914 5.583175 5.544779 5.537239 5.522028 5.580907 5.617147
##       y.83     y.84     y.85     y.86     y.87     y.88     y.89     y.90
## 1 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 2 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 3 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 4 5.555053 5.542352 5.550614 5.533137 5.529942 5.477303 5.410050 5.527185
## 5 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
## 6 5.555053 5.617270 5.550614 5.533137 5.613547 5.596330 5.582510 5.612262
##       y.91     y.92     y.93     y.94     y.95     y.96     y.97     y.98
## 1 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 2 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 3 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 4 5.478174 5.547971 5.467239 5.487656 5.413169 5.544515 5.545433 5.426759
## 5 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
## 6 5.583784 5.547971 5.589848 5.589980 5.576590 5.544515 5.545433 5.575836
##       y.99    y.100    y.101    y.102    y.103    y.104    y.105    y.106
## 1 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 2 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 3 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 4 5.525721 5.415982 5.463120 5.532691 5.412937 5.417542 5.544046 5.533738
## 5 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
## 6 5.614626 5.578604 5.575322 5.532691 5.577159 5.578380 5.544046 5.533738
##      y.107    y.108    y.109    y.110    y.111    y.112    y.113    y.114
## 1 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 2 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 3 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 4 5.506219 5.532807 5.537179 5.537178 5.506948 5.487108 5.416805 5.535733
## 5 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
## 6 5.595802 5.532807 5.623284 5.537178 5.613605 5.589655 5.574140 5.535733
##      y.115    y.116    y.117    y.118    y.119    y.120    y.121    y.122
## 1 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 2 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 3 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 4 5.423623 5.541694 5.515342 5.535200 5.535836 5.418830 5.489185 5.540095
## 5 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
## 6 5.582237 5.614404 5.612191 5.535200 5.535836 5.575769 5.589099 5.540095
##      y.123    y.124    y.125    y.126    y.127    y.128    y.129    y.130
## 1 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.566150
## 2 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 3 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 4 5.517462 5.545816 5.412221 5.515359 5.425266 5.489404 5.509599 5.416422
## 5 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
## 6 5.611375 5.545816 5.590053 5.515359 5.582281 5.587766 5.611086 5.566150
##      y.131    y.132    y.133    y.134    y.135    y.136    y.137    y.138
## 1 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 2 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 3 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 4 5.488369 5.467402 5.512339 5.455007 5.492202 5.459637 5.507670 5.420021
## 5 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
## 6 5.599230 5.576309 5.594607 5.578229 5.581169 5.579355 5.588433 5.589116
##      y.139    y.140    y.141    y.142    y.143    y.144    y.145    y.146
## 1 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 2 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 3 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 4 5.417134 5.474799 5.534011 5.524157 5.546098 5.534172 5.506131 5.482230
## 5 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
## 6 5.577193 5.585907 5.534011 5.524157 5.546098 5.534172 5.588870 5.591275
##      y.147    y.148    y.149    y.150    y.151    y.152    y.153    y.154
## 1 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 2 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 3 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 4 5.508418 5.535394 5.531539 5.516210 5.424957 5.539019 5.541125 5.423453
## 5 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
## 6 5.508418 5.535394 5.531539 5.516210 5.579407 5.539019 5.541125 5.583511
##      y.155    y.156    y.157    y.158    y.159    y.160    y.161    y.162
## 1 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 2 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 3 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 4 5.532995 5.483591 5.528017 5.483539 5.519586 5.534374 5.427006 5.467505
## 5 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
## 6 5.532995 5.599904 5.528017 5.591774 5.519586 5.534374 5.578182 5.579520
##      y.163    y.164    y.165    y.166    y.167    y.168    y.169    y.170
## 1 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 2 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 3 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 4 5.468895 5.485674 5.542385 5.535081 5.554859 5.414541 5.415862 5.485898
## 5 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
## 6 5.586162 5.605892 5.542385 5.535081 5.554859 5.582213 5.580096 5.588619
##      y.171    y.172    y.173    y.174    y.175    y.176    y.177    y.178
## 1 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 2 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 3 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 4 5.523244 5.519855 5.418982 5.534619 5.532754 5.549795 5.501383 5.488853
## 5 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
## 6 5.523244 5.519855 5.579163 5.534619 5.532754 5.549795 5.603655 5.585438
##      y.179    y.180    y.181    y.182    y.183    y.184    y.185    y.186
## 1 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 2 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 3 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 4 5.547167 5.535195 5.468392 5.525500 5.420115 5.549078 5.495446 5.537746
## 5 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
## 6 5.547167 5.535195 5.580059 5.525500 5.590836 5.549078 5.586831 5.537746
##      y.187    y.188    y.189    y.190    y.191    y.192    y.193    y.194
## 1 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 2 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 3 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 4 5.552061 5.532770 5.538103 5.492900 5.529299 5.538471 5.539225 5.547166
## 5 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
## 6 5.552061 5.532770 5.538103 5.599009 5.609372 5.538471 5.539225 5.547166
##      y.195    y.196    y.197    y.198    y.199    y.200
## 1 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 2 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 3 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 4 5.421518 5.540621 5.487981 5.525663 5.415265 5.539568
## 5 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
## 6 5.582236 5.606974 5.600313 5.525663 5.585280 5.539568
# Estatísticas para o B valores preditos para alguns casos.
mean(as.vector(pred[1, "y"]))
## [1] 5.495819
var(as.vector(pred[1, "y"]))
## [1] 0.002245619
# O predito médio ("a sabedoria das multidões").
pred$ym <- rowMeans(pred$y)

# Predito contra observado.
# x11()
xyplot(pred$ym ~ ap$lpreco, aspect = "iso") +
    layer(panel.abline(a = 0, b = 1))

# Qual o predito para o imóvel mediano?
new <- lapply(pred,
              FUN = function(x) {
                  if (is.numeric(x)) {
                      median(x, na.rm = TRUE)
                  } else {
                      levels(x)[1]
                  }
              })
new
## $quartos
## [1] 3
## 
## $banheiros
## [1] 2
## 
## $vagas
## [1] 2
## 
## $suites
## [1] 1
## 
## $bairro
## [1] "agua-verde"
## 
## $larea
## [1] 2
## 
## $y
## [1] 5.734044
## 
## $ym
## [1] 5.750674
# Predito por cada árvore.
y <- sapply(fits, FUN = predict, newdata = new)

# Distribuição dos B valores preditos e valor médio.
plot(density(y))
rug(y)
m <- mean(y)
abline(v = m, col = 2)

m
## [1] 5.752909
library(ipred)

# help(package = "ipred", help_type = "html")

# Fazendo bagging.
bg <- bagging(lpreco ~ ., data = ap, nbagg = 200, coob = TRUE)
bg
## 
## Bagging regression trees with 200 bootstrap replications 
## 
## Call: bagging.data.frame(formula = lpreco ~ ., data = ap, nbagg = 200, 
##     coob = TRUE)
## 
## Out-of-bag estimate of root mean squared error:  0.1048
# Predito contra observado.
xyplot(predict(bg, newdata = ap) ~ ap$lpreco, aspect = "iso")

1.4 Árvores de regressão com random forests

#-----------------------------------------------------------------------
# Prototipando.

xvars <- names(teca)[1:20]
nv <- floor(length(xvars)/3)

# Seleciona variáveis preditoras.
v <- sample(xvars, size = nv, replace = FALSE)
v
## [1] "p"   "al"  "n"   "Ur"  "ca"  "acc"
# Seleciona registros.
n <- nrow(teca)
i <- sample(1:n, size = n, replace = TRUE)

# Ajusta o modelo.
m0 <- rpart(prod ~ .,
            data = teca[i, c(v, "prod")],
            control = list(minsplit = 3,
                           cp = 0.001))
m0
## n= 48 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 48 52391.7900  79.30677  
##    2) ca< 3.21 12  3050.7400  33.67443 *
##    3) ca>=3.21 36 16024.0800  94.51754  
##      6) ca< 8.58 27  8981.3940  87.90780  
##       12) n>=2.316487 19  2654.9530  81.34304  
##         24) n< 2.700025 9   630.5857  76.51017 *
##         25) n>=2.700025 10  1624.9690  85.69262 *
##       13) n< 2.316487 8  3562.9050 103.49910 *
##      7) ca>=8.58 9  2324.3050 114.34680 *
# Visualiza.
rpart.plot(m0)

# Replicar.
set.seed(302010)
B <- 1000
rf <- replicate(B,
                simplify = FALSE,
                expr = {
                    v <- sample(xvars, size = nv, replace = FALSE)
                    i <- sample(1:n, size = n, replace = TRUE)
                    m0 <- rpart(prod ~ .,
                                data = teca[i, c(v, "prod")],
                                control = list(minsplit = 3,
                                               cp = 0.01))
                    return(m0)
                })

# ATTENTION: na árvore de regressão, o sorteio das variáveis é feito
# após cada split e não uma única vez como o que está neste código.

# Obtenção dos preditos.
y_rf <- sapply(rf, FUN = predict, newdata = teca)
head(y_rf[, 1:6])
##        [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## 1 120.42047  86.03186 148.18260  88.03894 125.74756  89.45850
## 2  27.36223  45.04491  42.23671  33.10333  31.73443  36.10144
## 3 120.42047 132.64916 120.09817  88.03894  60.18041 126.74866
## 4  63.93345  97.72206  92.24409 102.20512  99.91257  89.45850
## 5  63.93345  45.04491  42.23671  49.02340  60.18041  71.36626
## 6  27.36223  45.04491  42.23671  33.10333  31.73443  36.10144
# Cálculo da média.
ym <- rowMeans(y_rf)

xyplot(ym ~ teca$prod,
       aspect = "iso",
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

# Correlação entre observado e predito.
cor(ym, teca$prod)
## [1] 0.8953036
#-----------------------------------------------------------------------

library(randomForest)

rf <- randomForest(prod ~ .,
                   data = teca,
                   ntree = 3,
                   mtry = 2,
                   keep.inbag = TRUE,
                   keep.forest = TRUE)
rf
## 
## Call:
##  randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2,      keep.inbag = TRUE, keep.forest = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 3
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1161.826
##                     % Var explained: 11.81
1 - sum(rf$oob.times == 0)/length(rf$oob.times)
## [1] 0.7083333
str(rf)
## List of 18
##  $ call           : language randomForest(formula = prod ~ ., data = teca, ntree = 3, mtry = 2,      keep.inbag = TRUE, keep.forest = TRUE)
##  $ type           : chr "regression"
##  $ predicted      : Named num [1:48] 92.1 28.9 84.3 NA 59.1 ...
##   ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
##  $ mse            : num [1:3] 1166 1270 1162
##  $ rsq            : num [1:3] 0.115 0.036 0.118
##  $ oob.times      : int [1:48] 1 1 1 0 1 1 0 0 1 0 ...
##  $ importance     : num [1:20, 1] 866.7 55.2 2243.9 0 368.8 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..$ : chr "IncNodePurity"
##  $ importanceSD   : NULL
##  $ localImportance: NULL
##  $ proximity      : NULL
##  $ ntree          : num 3
##  $ mtry           : num 2
##  $ forest         :List of 11
##   ..$ ndbigtree    : int [1:3] 31 33 35
##   ..$ nodestatus   : int [1:35, 1:3] -3 -3 -3 -3 -3 -1 -3 -1 -1 -1 ...
##   ..$ leftDaughter : int [1:35, 1:3] 2 4 6 8 10 0 12 0 0 0 ...
##   ..$ rightDaughter: int [1:35, 1:3] 3 5 7 9 11 0 13 0 0 0 ...
##   ..$ nodepred     : num [1:35, 1:3] 88.6 41.5 108 48.2 36.4 ...
##   ..$ bestvar      : int [1:35, 1:3] 11 5 20 6 5 0 8 0 0 0 ...
##   ..$ xbestsplit   : num [1:35, 1:3] 3.145 0.604 438.77 0.477 0.667 ...
##   ..$ ncat         : Named int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "names")= chr [1:20] "Ur" "Us" "alp" "n" ...
##   ..$ nrnodes      : int 35
##   ..$ ntree        : num 3
##   ..$ xlevels      :List of 20
##   .. ..$ Ur : num 0
##   .. ..$ Us : num 0
##   .. ..$ alp: num 0
##   .. ..$ n  : num 0
##   .. ..$ I  : num 0
##   .. ..$ Ui : num 0
##   .. ..$ S  : num 0
##   .. ..$ ph : num 0
##   .. ..$ p  : num 0
##   .. ..$ k  : num 0
##   .. ..$ ca : num 0
##   .. ..$ mg : num 0
##   .. ..$ al : num 0
##   .. ..$ ctc: num 0
##   .. ..$ sat: num 0
##   .. ..$ mo : num 0
##   .. ..$ arg: num 0
##   .. ..$ are: num 0
##   .. ..$ cas: num 0
##   .. ..$ acc: num 0
##  $ coefs          : NULL
##  $ y              : Named num [1:48] 68.2 24.3 125.3 102.7 54.6 ...
##   ..- attr(*, "names")= chr [1:48] "1" "2" "3" "4" ...
##  $ test           : NULL
##  $ inbag          : int [1:48, 1:3] 2 1 2 1 1 1 1 1 2 1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:48] "1" "2" "3" "4" ...
##   .. ..$ : NULL
##  $ terms          :Classes 'terms', 'formula'  language prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg + al +      ctc + sat + mo + arg + are + cas + acc
##   .. ..- attr(*, "variables")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat,      mo, arg, are, cas, acc)
##   .. ..- attr(*, "factors")= int [1:21, 1:20] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:21] "prod" "Ur" "Us" "alp" ...
##   .. .. .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..- attr(*, "term.labels")= chr [1:20] "Ur" "Us" "alp" "n" ...
##   .. ..- attr(*, "order")= int [1:20] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= num 0
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(prod, Ur, Us, alp, n, I, Ui, S, ph, p, k, ca, mg, al, ctc, sat,      mo, arg, are, cas, acc)
##   .. ..- attr(*, "dataClasses")= Named chr [1:21] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:21] "prod" "Ur" "Us" "alp" ...
##  - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"
# Número de vezes que cada variável foi usada em cada bag.
head(rf$inbag)
##   [,1] [,2] [,3]
## 1    2    0    1
## 2    1    0    1
## 3    2    0    2
## 4    1    1    3
## 5    1    1    0
## 6    1    1    0
# Valores preditos.
sort(unique(predict(rf)))
##  [1]  25.48482  28.89996  44.40381  56.02661  59.11806  62.78211  63.90061
##  [8]  68.94633  71.36430  71.57095  79.79300  84.33689  91.26764  92.09106
## [15]  98.10957  99.46530 100.46795 105.44807 108.87186 121.28472 121.47778
## [22] 123.38324 130.45181 136.58047
# Inspecionando uma das árvores da floresta.
t <- 1
one_tree <- getTree(rf, k = t, labelVar = TRUE)
nrow(one_tree)
## [1] 31
rf$forest$ndbigtree[t]
## [1] 31
one_tree
##    left daughter right daughter split var split point status prediction
## 1              2              3        ca   3.1450000     -3   88.61266
## 2              4              5         I   0.6043793     -3   41.46031
## 3              6              7       acc 438.7700000     -3  108.02833
## 4              8              9        Ui   0.4772272     -3   48.19479
## 5             10             11         I   0.6669049     -3   36.40944
## 6              0              0      <NA>   0.0000000     -1  170.76465
## 7             12             13        ph   6.8500000     -3   99.66349
## 8              0              0      <NA>   0.0000000     -1   29.94343
## 9              0              0      <NA>   0.0000000     -1   66.44616
## 10             0              0      <NA>   0.0000000     -1   22.58625
## 11            14             15        Ui   0.4738361     -3   41.01717
## 12            16             17       are 711.5000000     -3   90.98453
## 13            18             19         p  52.9000000     -3  114.65443
## 14             0              0      <NA>   0.0000000     -1   47.97021
## 15             0              0      <NA>   0.0000000     -1   37.54065
## 16            20             21         p   2.5350000     -3   94.54145
## 17            22             23       alp  -0.7142416     -3   83.27785
## 18            24             25        Ur   0.1850338     -3  117.94994
## 19             0              0      <NA>   0.0000000     -1   81.69934
## 20             0              0      <NA>   0.0000000     -1  109.64509
## 21            26             27        Us   0.7230838     -3   91.79534
## 22             0              0      <NA>   0.0000000     -1   68.22631
## 23             0              0      <NA>   0.0000000     -1   90.80361
## 24             0              0      <NA>   0.0000000     -1  105.44807
## 25             0              0      <NA>   0.0000000     -1  130.45181
## 26            28             29       cas  32.0500000     -3   93.02273
## 27             0              0      <NA>   0.0000000     -1   79.52139
## 28            30             31        ph   6.4500000     -3   88.72769
## 29             0              0      <NA>   0.0000000     -1   99.46530
## 30             0              0      <NA>   0.0000000     -1   70.95099
## 31             0              0      <NA>   0.0000000     -1   92.28303
# Outra forma de acessar as variáveis usadas.
rf$forest$bestvar[, t]
##  [1] 11  5 20  6  5  0  8  0  0  0  6 18  9  0  0  9  3  1  0  0  2  0  0
## [24]  0  0 19  0  8  0  0  0  0  0  0  0
names(rf$forest$xlevels)[rf$forest$bestvar[, t]]
##  [1] "ca"  "I"   "acc" "Ui"  "I"   "ph"  "Ui"  "are" "p"   "p"   "alp"
## [12] "Ur"  "Us"  "cas" "ph"
#-----------------------------------------------------------------------
# help(package = "randomForest", help_type = "html")

rf <- randomForest(prod ~ .,
                   data = teca,
                   ntree = B,
                   mtry = nv)
rf
## 
## Call:
##  randomForest(formula = prod ~ ., data = teca, ntree = B, mtry = nv) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 632.4727
##                     % Var explained: 51.99
# Valores preditos.
yp <- predict(rf)

# Correlação entre observado e predito.
cor(yp, teca$prod)
## [1] 0.7210519
xyplot(ym + yp ~ teca$prod,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

# ATTENTION: a randomForest() criou árvores mais profundas que o código
# didático feito algumas linhas acima. Além do mais, as `m` variáveis
# são sorteadas a cada split e não uma vez apenas.

# o <- order(yp)
# xyplot(ym[o] + yp[o] ~ seq_along(ym), auto.key = TRUE)

Para construir o gráfico de uma árvore, leia esse post: https://stats.stackexchange.com/questions/41443/how-to-actually-plot-a-sample-tree-from-randomforestgettree.

#-----------------------------------------------------------------------
# Dados de imóveis.

nv <- floor(sqrt(ncol(ap) - 1))
nv
## [1] 2
ap2 <- na.omit(ap)

rf <- randomForest(lpreco ~ .,
                   data = ap2,
                   ntree = 100,
                   mtry = nv)
rf
## 
## Call:
##  randomForest(formula = lpreco ~ ., data = ap2, ntree = 100, mtry = nv) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.007040021
##                     % Var explained: 90.87
# Valores preditos.
yp <- predict(rf)

cor(yp, ap2$lpreco)
## [1] 0.9533648
# Correlação entre observado e predito.
xyplot(yp ~ ap2$lpreco,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    layer(panel.abline(a = 0, b = 1))

importance(rf, type = 2)
##           IncNodePurity
## quartos        7.321537
## banheiros     12.829150
## vagas         50.273498
## suites        29.342710
## bairro         6.591223
## larea         86.955501
im <- importance(rf, type = 2)
100 * im/sum(im)
##           IncNodePurity
## quartos        3.787388
## banheiros      6.636444
## vagas         26.006185
## suites        15.178812
## bairro         3.409601
## larea         44.981570
varImpPlot(rf)

1.5 Árvore de regressão com boosting

Veja as ilustrações aqui para entender: https://medium.com/mlreview/gradient-boosting-from-scratch-1e317ae4587d.

Visite também gradiente descendente boosting.

library(gbm)

# help(package = "gbm", help_type = "html")

rb <- gbm(lpreco ~ .,
          data = ap,
          distribution = "gaussian",
          n.trees = 100,
          shrinkage = 0.05,
          interaction.depth = 1,
          train.fraction = 0.7,
          n.minobsinnode = 10,
          cv.folds = 3,
          keep.data = TRUE,
          verbose = FALSE,
          n.cores = 1)

rb
## gbm(formula = lpreco ~ ., distribution = "gaussian", data = ap, 
##     n.trees = 100, interaction.depth = 1, n.minobsinnode = 10, 
##     shrinkage = 0.05, train.fraction = 0.7, cv.folds = 3, keep.data = TRUE, 
##     verbose = FALSE, n.cores = 1)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 100.
## The best test-set iteration was 100.
## There were 6 predictors of which 5 had non-zero influence.
summary(rb)

##                 var    rel.inf
## larea         larea 54.0322474
## vagas         vagas 27.3236736
## suites       suites 17.0719486
## bairro       bairro  1.2286939
## banheiros banheiros  0.3434365
## quartos     quartos  0.0000000
# Valores preditos.
yp <- predict(rb)
## Using 100 trees...
# Correlação entre observado e predito.
cor(yp, ap$lpreco)
## [1] 0.930845
xyplot(yp ~ ap$lpreco,
       aspect = "iso",
       auto.key = TRUE,
       type = c("p", "smooth")) +
    latticeExtra::layer(panel.abline(a = 0, b = 1))

2 Métodos para árvores de classificação

Próxima aula.

25px