Ajuste de modelos de regressão linear
Walmes Zeviani
Definições da sessão
##-----------------------------------------------------------------------------
## Definições da sessão, pacotes a serem usados.
pkg <- c("lattice", "latticeExtra", "reshape", "car", "alr3",
"plyr", "wzRfun")
sapply(pkg, library, character.only=TRUE, logical.return=TRUE)
## lattice latticeExtra reshape car alr3 plyr
## TRUE TRUE TRUE TRUE TRUE TRUE
## wzRfun
## TRUE
source("lattice_setup.R")
##-----------------------------------------------------------------------------
## Informações sobre as versões dos pacotes.
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i686-pc-linux-gnu (32-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=pt_BR.UTF-8
## [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets base
##
## other attached packages:
## [1] wzRfun_0.1 plyr_1.8.1 alr3_2.0.5 car_2.0-20
## [5] reshape_0.8.5 latticeExtra_0.6-26 RColorBrewer_1.0-5 lattice_0.20-29
## [9] knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.5 formatR_0.10 grid_3.1.1 MASS_7.3-33 methods_3.1.1
## [6] nnet_7.3-8 Rcpp_0.11.1 stringr_0.6.2 tools_3.1.1
## obs: Para instalar um pacote faça:
## install.packages("nome_do_pacote", dependencies=TRUE)
Modelo para relação altura e diâmetro à altura do peito
##-----------------------------------------------------------------------------
## Dados de inventário florestal.
dap <- read.table("http://www.leg.ufpr.br/~walmes/data/dap.txt",
header=TRUE, sep="\t")
str(dap)
## 'data.frame': 991 obs. of 2 variables:
## $ DAP: num 4.87 7.38 5.95 5.73 6.4 ...
## $ HT : num 9.5 9.8 13 13.5 13.5 13.5 13.5 14.3 14.8 14.8 ...
## Renomeia.
names(dap) <- c("d","h")
##-----------------------------------------------------------------------------
## Visualização.
xyplot(h~d, data=dap, type=c("p","smooth"))

##-----------------------------------------------------------------------------
## Frequência de obsevações ausentes.
sum(!is.na(dap$d))
## [1] 991
sum(!is.na(dap$h))
## [1] 223
head(dap)
## d h
## 1 4.870 9.5
## 2 7.385 9.8
## 3 5.952 13.0
## 4 5.730 13.5
## 5 6.398 13.5
## 6 6.748 13.5
tail(dap)
## d h
## 986 8.785 NA
## 987 19.481 NA
## 988 19.226 NA
## 989 9.040 NA
## 990 13.560 NA
## 991 10.154 NA
## Foram medidas 991 árvores, mas em apenas 223 que se tem h e d. Nas
## restantes só o d. Quer-se estimar a h a partir dos pares conhecidos.
##-----------------------------------------------------------------------------
## Criando novas variáveis para explorar um modelo com boa previsão de
## valores de h a partir de valores de d. Usar procedimento stepwise.
dap <- transform(dap,
d2=d^2, ## quadrado
d3=d^3, ## cubo
dr=sqrt(d), ## raíz
dl=log(d), ## ln
di=1/d, ## recíproco
di2=1/d^2) ## recíproco ao quadrado
str(dap)
## 'data.frame': 991 obs. of 8 variables:
## $ d : num 4.87 7.38 5.95 5.73 6.4 ...
## $ h : num 9.5 9.8 13 13.5 13.5 13.5 13.5 14.3 14.8 14.8 ...
## $ d2 : num 23.7 54.5 35.4 32.8 40.9 ...
## $ d3 : num 116 403 211 188 262 ...
## $ dr : num 2.21 2.72 2.44 2.39 2.53 ...
## $ dl : num 1.58 2 1.78 1.75 1.86 ...
## $ di : num 0.205 0.135 0.168 0.175 0.156 ...
## $ di2: num 0.0422 0.0183 0.0282 0.0305 0.0244 ...
##-----------------------------------------------------------------------------
## As relações marginais.
## splom(dap)
par(mfrow=c(2,4))
for(i in c(1,3:8)){
plot(dap[,"h"]~dap[,i], main=names(dap)[i])
}
layout(1)

##-----------------------------------------------------------------------------
## Ordena os dados e deixa apenas os registros completos (linha cheia).
## Ordenar para evitar efeito espaguete quando plotar.
dap <- dap[order(dap$d),]
## Nova base que contém d e h observados.
dapcc <- dap[complete.cases(dap),]
## reseta o nome das linhas
rownames(dapcc) <- NULL
head(dapcc)
## d h d2 d3 dr dl di di2
## 1 4.870 9.5 23.72 115.5 2.207 1.583 0.2053 0.04216
## 2 5.730 13.5 32.83 188.1 2.394 1.746 0.1745 0.03046
## 3 5.952 13.0 35.43 210.9 2.440 1.784 0.1680 0.02822
## 4 6.334 15.5 40.12 254.2 2.517 1.846 0.1579 0.02492
## 5 6.398 13.5 40.93 261.9 2.529 1.856 0.1563 0.02443
## 6 6.748 13.5 45.54 307.3 2.598 1.909 0.1482 0.02196
## tail(dapcc)
## str(dapcc)
##-----------------------------------------------------------------------------
## Ajuste de modelos.
## Linear simples.
m0 <- lm(h~d, data=dapcc)
summary(m0)
##
## Call:
## lm(formula = h ~ d, data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.454 -1.268 0.048 1.333 11.420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.8998 0.5376 22.1 <2e-16 ***
## d 0.7424 0.0297 25.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.3 on 221 degrees of freedom
## Multiple R-squared: 0.739, Adjusted R-squared: 0.738
## F-statistic: 625 on 1 and 221 DF, p-value: <2e-16
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m0)~d, dapcc, col=2)
plot(m0)

layout(1)
## Apresenta falta de ajuste.
## Quadrático.
m1 <- lm(h~d+d2, data=dapcc)
summary(m1)
##
## Call:
## lm(formula = h ~ d + d2, data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.311 -1.128 0.062 1.210 11.086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.48684 1.31133 3.42 0.00074 ***
## d 1.76152 0.16902 10.42 < 2e-16 ***
## d2 -0.03130 0.00512 -6.11 4.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.14 on 220 degrees of freedom
## Multiple R-squared: 0.777, Adjusted R-squared: 0.775
## F-statistic: 383 on 2 and 220 DF, p-value: <2e-16
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m1)~d, dapcc, col=2)
plot(m1)

layout(1)
## Cúbico.
m2 <- lm(h~d+d2+d3, data=dapcc)
summary(m2)
##
## Call:
## lm(formula = h ~ d + d2 + d3, data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.634 -1.118 0.041 1.189 10.613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.044684 3.004680 -2.01 0.04547 *
## d 4.093204 0.624615 6.55 4.0e-10 ***
## d2 -0.186500 0.040424 -4.61 6.7e-06 ***
## d3 0.003194 0.000826 3.87 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.07 on 219 degrees of freedom
## Multiple R-squared: 0.791, Adjusted R-squared: 0.788
## F-statistic: 276 on 3 and 219 DF, p-value: <2e-16
anova(m2)
## Analysis of Variance Table
##
## Response: h
## Df Sum Sq Mean Sq F value Pr(>F)
## d 1 3320 3320 774.1 < 2e-16 ***
## d2 1 170 170 39.7 1.6e-09 ***
## d3 1 64 64 15.0 0.00014 ***
## Residuals 219 939 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(m2)~d, dapcc, col=2)
plot(m2)

layout(1)
##-----------------------------------------------------------------------------
## Cria a lista com as fórmulas dos modelos, 7 no total.
formulas <- list(linear=h~d,
quadratico=h~d+d2,
cubico=h~d+d2+d3,
reciproco=h~d+di,
reciproco2=h~d+di2,
raiz=h~d+dr,
log=h~d+dl)
formulas
## $linear
## h ~ d
##
## $quadratico
## h ~ d + d2
##
## $cubico
## h ~ d + d2 + d3
##
## $reciproco
## h ~ d + di
##
## $reciproco2
## h ~ d + di2
##
## $raiz
## h ~ d + dr
##
## $log
## h ~ d + dl
##-----------------------------------------------------------------------------
## Com a lapply(), o ajuste será feito usando cada uma das fórmulas da
## lista.
## Todos os ajustes.
ajustes <- lapply(formulas, lm, data=dapcc)
names(ajustes)
## [1] "linear" "quadratico" "cubico" "reciproco" "reciproco2" "raiz"
## [7] "log"
## Todas as tabelas com estimativas de medidas de ajuste.
lapply(ajustes, summary)
## $linear
##
## Call:
## FUN(formula = X[[1L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.454 -1.268 0.048 1.333 11.420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.8998 0.5376 22.1 <2e-16 ***
## d 0.7424 0.0297 25.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.3 on 221 degrees of freedom
## Multiple R-squared: 0.739, Adjusted R-squared: 0.738
## F-statistic: 625 on 1 and 221 DF, p-value: <2e-16
##
##
## $quadratico
##
## Call:
## FUN(formula = X[[2L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.311 -1.128 0.062 1.210 11.086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.48684 1.31133 3.42 0.00074 ***
## d 1.76152 0.16902 10.42 < 2e-16 ***
## d2 -0.03130 0.00512 -6.11 4.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.14 on 220 degrees of freedom
## Multiple R-squared: 0.777, Adjusted R-squared: 0.775
## F-statistic: 383 on 2 and 220 DF, p-value: <2e-16
##
##
## $cubico
##
## Call:
## FUN(formula = X[[3L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.634 -1.118 0.041 1.189 10.613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.044684 3.004680 -2.01 0.04547 *
## d 4.093204 0.624615 6.55 4.0e-10 ***
## d2 -0.186500 0.040424 -4.61 6.7e-06 ***
## d3 0.003194 0.000826 3.87 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.07 on 219 degrees of freedom
## Multiple R-squared: 0.791, Adjusted R-squared: 0.788
## F-statistic: 276 on 3 and 219 DF, p-value: <2e-16
##
##
## $reciproco
##
## Call:
## FUN(formula = X[[4L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.552 -1.138 0.007 1.226 10.648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.8607 1.9183 12.96 < 2e-16 ***
## d 0.3162 0.0667 4.74 3.8e-06 ***
## di -85.0770 12.1784 -6.99 3.3e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.09 on 220 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.784
## F-statistic: 405 on 2 and 220 DF, p-value: <2e-16
##
##
## $reciproco2
##
## Call:
## FUN(formula = X[[5L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.722 -1.088 0.059 1.232 10.630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.6752 0.9875 17.90 < 2e-16 ***
## d 0.4951 0.0456 10.85 < 2e-16 ***
## di2 -291.6728 43.2847 -6.74 1.4e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared: 0.783, Adjusted R-squared: 0.782
## F-statistic: 398 on 2 and 220 DF, p-value: <2e-16
##
##
## $raiz
##
## Call:
## FUN(formula = X[[6L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.360 -1.131 0.044 1.167 10.831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.075 4.283 -3.99 9.1e-05 ***
## d -1.216 0.289 -4.21 3.7e-05 ***
## dr 15.313 2.249 6.81 9.2e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared: 0.784, Adjusted R-squared: 0.782
## F-statistic: 400 on 2 and 220 DF, p-value: <2e-16
##
##
## $log
##
## Call:
## FUN(formula = X[[7L]], data = ..1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.410 -1.115 0.008 1.190 10.754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.138 3.355 -3.32 0.0011 **
## d -0.203 0.139 -1.46 0.1454
## dl 14.095 2.031 6.94 4.3e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.09 on 220 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.784
## F-statistic: 403 on 2 and 220 DF, p-value: <2e-16
## Todos os gráficos de resíduos vs ajustados.
par(mfrow=c(2,4))
lapply(ajustes, plot, which=1)
## $linear
## NULL
##
## $quadratico
## NULL
##
## $cubico
## NULL
##
## $reciproco
## NULL
##
## $reciproco2
## NULL
##
## $raiz
## NULL
##
## $log
## NULL
layout(1)

## Extraindo o valor de R² ajustado de cada modelo.
r2 <- sapply(ajustes, function(a){
summary(a)$adj.r.squared
})
## Valores do R².
cbind(r2)
## r2
## linear 0.7376
## quadratico 0.7747
## cubico 0.7881
## reciproco 0.7843
## reciproco2 0.7815
## raiz 0.7823
## log 0.7838
## Gráficos.
## barplot(r2, ylim=c(0,1))
barplot(r2, ylim=c(0.7,0.8), xpd=FALSE)
box(bty="l")

##-----------------------------------------------------------------------------
## Correndo um stepwise.
## Modelo com todas as variáveis.
m7 <- lm(h~., data=dapcc)
summary(m7)
##
## Call:
## lm(formula = h ~ ., data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.148 -1.107 0.062 1.123 11.075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.41e+03 7.91e+03 1.19 0.236
## d -8.32e+03 3.93e+03 -2.12 0.035 *
## d2 5.55e+01 2.52e+01 2.20 0.029 *
## d3 -3.35e-01 1.48e-01 -2.26 0.025 *
## dr 9.81e+04 4.74e+04 2.07 0.040 *
## dl -9.76e+04 4.85e+04 -2.01 0.045 *
## di -1.79e+05 9.43e+04 -1.90 0.059 .
## di2 1.11e+05 6.25e+04 1.78 0.077 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.05 on 215 degrees of freedom
## Multiple R-squared: 0.799, Adjusted R-squared: 0.793
## F-statistic: 122 on 7 and 215 DF, p-value: <2e-16
## obs: o ponto "." indica "todas as variáveis restante do arquivo".
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc); lines(fitted(m7)~d, dapcc, col=2)
plot(m7)
## Warning: NaNs produced
## Warning: NaNs produced

layout(1)
##-----------------------------------------------------------------------------
## Seleção de modelos/variáveis (both, forward, backward).
## Por padrão é o AIC: ll-2*p. Peso para os parâmetros é 2.
step(m7, direction="both")
## Start: AIC=327.9
## h ~ d + d2 + d3 + dr + dl + di + di2
##
## Df Sum of Sq RSS AIC
## <none> 903 328
## - di2 1 13.3 916 329
## - di 1 15.1 918 330
## - dl 1 17.0 920 330
## - dr 1 18.0 921 330
## - d 1 18.9 922 330
## - d2 1 20.4 923 331
## - d3 1 21.5 924 331
##
## Call:
## lm(formula = h ~ d + d2 + d3 + dr + dl + di + di2, data = dapcc)
##
## Coefficients:
## (Intercept) d d2 d3 dr dl
## 9.41e+03 -8.32e+03 5.55e+01 -3.35e-01 9.81e+04 -9.76e+04
## di di2
## -1.79e+05 1.11e+05
## logLik(m7)-2*length(coef(m7))
## Assim é o BIC: ll-log(n)*p. Peso para os parâmetros é log(n).
step(m7, direction="both", k=log(nrow(dapcc)))
## Start: AIC=355.1
## h ~ d + d2 + d3 + dr + dl + di + di2
##
## Df Sum of Sq RSS AIC
## - di2 1 13.3 916 353
## - di 1 15.1 918 353
## - dl 1 17.0 920 354
## - dr 1 18.0 921 354
## - d 1 18.9 922 354
## - d2 1 20.4 923 355
## - d3 1 21.5 924 355
## <none> 903 355
##
## Step: AIC=353
## h ~ d + d2 + d3 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - d3 1 16.5 933 352
## - d2 1 18.2 934 352
## - d 1 19.4 936 352
## - dr 1 19.8 936 352
## - di 1 19.8 936 352
## - dl 1 19.9 936 352
## <none> 916 353
## + di2 1 13.3 903 355
##
## Step: AIC=351.5
## h ~ d + d2 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - di 1 3.73 936 347
## - dl 1 5.22 938 347
## - dr 1 6.27 939 348
## - d 1 7.31 940 348
## - d2 1 9.48 942 348
## <none> 933 352
## + d3 1 16.46 916 353
## + di2 1 8.26 924 355
##
## Step: AIC=347
## h ~ d + d2 + dr + dl
##
## Df Sum of Sq RSS AIC
## - dl 1 13.63 950 345
## - dr 1 16.95 953 346
## - d 1 18.54 955 346
## - d2 1 20.80 957 346
## <none> 936 347
## + di2 1 4.56 932 351
## + di 1 3.73 933 352
## + d3 1 0.43 936 352
##
## Step: AIC=344.8
## h ~ d + d2 + dr
##
## Df Sum of Sq RSS AIC
## - d2 1 19.4 969 344
## <none> 950 345
## - d 1 30.8 981 347
## + d3 1 13.9 936 347
## + dl 1 13.6 936 347
## + di 1 12.1 938 347
## + di2 1 10.3 940 348
## - dr 1 53.3 1003 352
##
## Step: AIC=343.9
## h ~ d + dr
##
## Df Sum of Sq RSS AIC
## <none> 969 344
## + d3 1 22.4 947 344
## + d2 1 19.4 950 345
## + dl 1 12.2 957 346
## + di 1 9.0 960 347
## + di2 1 6.4 963 348
## - d 1 78.1 1047 356
## - dr 1 204.3 1174 381
##
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
##
## Coefficients:
## (Intercept) d dr
## -17.08 -1.22 15.31
## Uso um peso arbitrário.
step(m7, direction="both", k=12)
## Start: AIC=407.9
## h ~ d + d2 + d3 + dr + dl + di + di2
##
## Df Sum of Sq RSS AIC
## - di2 1 13.3 916 399
## - di 1 15.1 918 400
## - dl 1 17.0 920 400
## - dr 1 18.0 921 400
## - d 1 18.9 922 400
## - d2 1 20.4 923 401
## - d3 1 21.5 924 401
## <none> 903 408
##
## Step: AIC=399.1
## h ~ d + d2 + d3 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - d3 1 16.5 933 391
## - d2 1 18.2 934 391
## - d 1 19.4 936 392
## - dr 1 19.8 936 392
## - di 1 19.8 936 392
## - dl 1 19.9 936 392
## <none> 916 399
## + di2 1 13.3 903 408
##
## Step: AIC=391.1
## h ~ d + d2 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - di 1 3.73 936 380
## - dl 1 5.22 938 380
## - dr 1 6.27 939 381
## - d 1 7.31 940 381
## - d2 1 9.48 942 381
## <none> 933 391
## + d3 1 16.46 916 399
## + di2 1 8.26 924 401
##
## Step: AIC=380
## h ~ d + d2 + dr + dl
##
## Df Sum of Sq RSS AIC
## - dl 1 13.63 950 371
## - dr 1 16.95 953 372
## - d 1 18.54 955 372
## - d2 1 20.80 957 373
## <none> 936 380
## + di2 1 4.56 932 391
## + di 1 3.73 933 391
## + d3 1 0.43 936 392
##
## Step: AIC=371.2
## h ~ d + d2 + dr
##
## Df Sum of Sq RSS AIC
## - d2 1 19.4 969 364
## - d 1 30.8 981 366
## <none> 950 371
## - dr 1 53.3 1003 371
## + d3 1 13.9 936 380
## + dl 1 13.6 936 380
## + di 1 12.1 938 380
## + di2 1 10.3 940 381
##
## Step: AIC=363.7
## h ~ d + dr
##
## Df Sum of Sq RSS AIC
## <none> 969 364
## - d 1 78.1 1047 369
## + d3 1 22.4 947 370
## + d2 1 19.4 950 371
## + dl 1 12.2 957 373
## + di 1 9.0 960 374
## + di2 1 6.4 963 374
## - dr 1 204.3 1174 394
##
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
##
## Coefficients:
## (Intercept) d dr
## -17.08 -1.22 15.31
##-----------------------------------------------------------------------------
## Modelo final.
mfin <- step(m7, direction="both", k=log(nrow(dapcc)))
## Start: AIC=355.1
## h ~ d + d2 + d3 + dr + dl + di + di2
##
## Df Sum of Sq RSS AIC
## - di2 1 13.3 916 353
## - di 1 15.1 918 353
## - dl 1 17.0 920 354
## - dr 1 18.0 921 354
## - d 1 18.9 922 354
## - d2 1 20.4 923 355
## - d3 1 21.5 924 355
## <none> 903 355
##
## Step: AIC=353
## h ~ d + d2 + d3 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - d3 1 16.5 933 352
## - d2 1 18.2 934 352
## - d 1 19.4 936 352
## - dr 1 19.8 936 352
## - di 1 19.8 936 352
## - dl 1 19.9 936 352
## <none> 916 353
## + di2 1 13.3 903 355
##
## Step: AIC=351.5
## h ~ d + d2 + dr + dl + di
##
## Df Sum of Sq RSS AIC
## - di 1 3.73 936 347
## - dl 1 5.22 938 347
## - dr 1 6.27 939 348
## - d 1 7.31 940 348
## - d2 1 9.48 942 348
## <none> 933 352
## + d3 1 16.46 916 353
## + di2 1 8.26 924 355
##
## Step: AIC=347
## h ~ d + d2 + dr + dl
##
## Df Sum of Sq RSS AIC
## - dl 1 13.63 950 345
## - dr 1 16.95 953 346
## - d 1 18.54 955 346
## - d2 1 20.80 957 346
## <none> 936 347
## + di2 1 4.56 932 351
## + di 1 3.73 933 352
## + d3 1 0.43 936 352
##
## Step: AIC=344.8
## h ~ d + d2 + dr
##
## Df Sum of Sq RSS AIC
## - d2 1 19.4 969 344
## <none> 950 345
## - d 1 30.8 981 347
## + d3 1 13.9 936 347
## + dl 1 13.6 936 347
## + di 1 12.1 938 347
## + di2 1 10.3 940 348
## - dr 1 53.3 1003 352
##
## Step: AIC=343.9
## h ~ d + dr
##
## Df Sum of Sq RSS AIC
## <none> 969 344
## + d3 1 22.4 947 344
## + d2 1 19.4 950 345
## + dl 1 12.2 957 346
## + di 1 9.0 960 347
## + di2 1 6.4 963 348
## - d 1 78.1 1047 356
## - dr 1 204.3 1174 381
summary(mfin)
##
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.360 -1.131 0.044 1.167 10.831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.075 4.283 -3.99 9.1e-05 ***
## d -1.216 0.289 -4.21 3.7e-05 ***
## dr 15.313 2.249 6.81 9.2e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared: 0.784, Adjusted R-squared: 0.782
## F-statistic: 400 on 2 and 220 DF, p-value: <2e-16
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc)
lines(fitted(mfin)~d, dapcc, col=2)
plot(mfin)

layout(1)
##-----------------------------------------------------------------------------
## Medidas de influência das observações.
inf <- influence.measures(mfin)
summary(inf)
## Potentially influential observations of
## lm(formula = h ~ d + dr, data = dapcc) :
##
## dfb.1_ dfb.d dfb.dr dffit cov.r cook.d hat
## 1 -0.25 -0.22 0.24 -0.27 1.17_* 0.02 0.14_*
## 2 0.12 0.11 -0.12 0.14 1.11_* 0.01 0.09_*
## 3 -0.01 0.00 0.01 -0.01 1.10_* 0.00 0.08_*
## 4 0.19 0.16 -0.18 0.22 1.07_* 0.02 0.06_*
## 5 -0.04 -0.03 0.04 -0.05 1.08_* 0.00 0.06_*
## 6 -0.09 -0.08 0.08 -0.11 1.07_* 0.00 0.05_*
## 7 -0.03 -0.03 0.03 -0.04 1.07_* 0.00 0.05_*
## 8 -0.12 -0.10 0.11 -0.15 1.06_* 0.01 0.05_*
## 9 0.10 0.08 -0.09 0.12 1.06_* 0.00 0.05_*
## 10 -0.41 -0.33 0.37 -0.56_* 0.94_* 0.10 0.04
## 11 -0.04 -0.03 0.04 -0.05 1.05_* 0.00 0.04
## 15 0.38 0.25 -0.31 0.78_* 0.71_* 0.18 0.02
## 29 -0.02 0.03 -0.01 -0.25 0.96_* 0.02 0.01
## 41 -0.17 -0.26 0.23 0.58_* 0.69_* 0.10 0.01
## 160 0.02 0.00 -0.01 -0.20 0.94_* 0.01 0.01
## 194 -0.08 -0.11 0.09 -0.27 0.93_* 0.02 0.01
## 209 -0.25 -0.31 0.28 -0.50_* 0.87_* 0.08 0.02
## 221 0.03 0.04 -0.04 0.05 1.05_* 0.00 0.03
## 222 0.13 0.15 -0.14 0.19 1.04_* 0.01 0.04
## 223 -0.19 -0.22 0.20 -0.26 1.05_* 0.02 0.05_*
## Sinalizando os pontos influentes.
str(inf)
## List of 3
## $ infmat: num [1:223, 1:7] -0.24888 0.12312 -0.00557 0.19068 -0.04001 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:223] "1" "2" "3" "4" ...
## .. ..$ : chr [1:7] "dfb.1_" "dfb.d" "dfb.dr" "dffit" ...
## $ is.inf: logi [1:223, 1:7] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:223] "1" "2" "3" "4" ...
## .. ..$ : chr [1:7] "dfb.1_" "dfb.d" "dfb.dr" "dffit" ...
## $ call : language lm(formula = h ~ d + dr, data = dapcc)
## - attr(*, "class")= chr "infl"
## Influentes pelo DFITS (influência sobre ajuste).
dfits <- inf$is.inf[,4]
plot(h~d, dapcc)
lines(fitted(mfin)~d, dapcc, col=2)
with(dapcc, points(d[dfits], h[dfits], col=2, pch=19))

##-----------------------------------------------------------------------------
## Identificar/remover os pontos discrepantes/influentes manualmente
## (critério visual).
plot(residuals(mfin)~d, dapcc)

## id <- identify(dapcc$d, residuals(mfin))
## id <- c(10L, 15L, 41L, 209L)
id <- which(dfits)
## Refazer a análise com os pontos removidos.
dapcc2 <- dapcc[-id,]
str(dapcc2)
## 'data.frame': 219 obs. of 8 variables:
## $ d : num 4.87 5.73 5.95 6.33 6.4 ...
## $ h : num 9.5 13.5 13 15.5 13.5 13.5 14.3 13.5 16 15 ...
## $ d2 : num 23.7 32.8 35.4 40.1 40.9 ...
## $ d3 : num 116 188 211 254 262 ...
## $ dr : num 2.21 2.39 2.44 2.52 2.53 ...
## $ dl : num 1.58 1.75 1.78 1.85 1.86 ...
## $ di : num 0.205 0.175 0.168 0.158 0.156 ...
## $ di2: num 0.0422 0.0305 0.0282 0.0249 0.0244 ...
mfinb <- lm(h~d+dr, data=dapcc2)
summary(mfinb)
##
## Call:
## lm(formula = h ~ d + dr, data = dapcc2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.615 -1.032 0.118 1.232 3.994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.210 3.585 -4.24 3.3e-05 ***
## d -1.034 0.241 -4.29 2.7e-05 ***
## dr 14.081 1.878 7.50 1.7e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.73 on 216 degrees of freedom
## Multiple R-squared: 0.846, Adjusted R-squared: 0.844
## F-statistic: 593 on 2 and 216 DF, p-value: <2e-16
summary(mfin)
##
## Call:
## lm(formula = h ~ d + dr, data = dapcc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.360 -1.131 0.044 1.167 10.831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.075 4.283 -3.99 9.1e-05 ***
## d -1.216 0.289 -4.21 3.7e-05 ***
## dr 15.313 2.249 6.81 9.2e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 220 degrees of freedom
## Multiple R-squared: 0.784, Adjusted R-squared: 0.782
## F-statistic: 400 on 2 and 220 DF, p-value: <2e-16
## Resultado.
layout(matrix(c(1,1,2,3,4,5),2,3))
plot(h~d, dapcc2)
lines(fitted(mfinb)~d, dapcc2, col=2) ## modelo com as remoções
lines(fitted(mfin)~d, dapcc, col=3) ## modelo com todas observações
plot(mfinb)

layout(1)
##-----------------------------------------------------------------------------
## Denconfia da normalidade por causa do leve arqueamento dos resíduos
## no qqnorm? Devemos transformar? Qual transformação usar?
plot(mfinb, which=2)

MASS::boxcox(mfinb, lambda=seq(0, 2, l=100)) # intervalo contém o 1?

## Qual o resultado dos testes?
shapiro.test(residuals(mfinb))
##
## Shapiro-Wilk normality test
##
## data: residuals(mfinb)
## W = 0.9893, p-value = 0.1028
shapiro.test(rstudent(mfinb))
##
## Shapiro-Wilk normality test
##
## data: rstudent(mfinb)
## W = 0.9889, p-value = 0.09033
ks.test(rstudent(mfinb), "pnorm", mean=0, sd=1)
## Warning: ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstudent(mfinb)
## D = 0.0426, p-value = 0.8226
## alternative hypothesis: two-sided
ks.test(rstudent(mfinb), "pt", df=nrow(dapcc)-length(coef(mfin)-1))
## Warning: ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstudent(mfinb)
## D = 0.0425, p-value = 0.8249
## alternative hypothesis: two-sided
##-----------------------------------------------------------------------------
## Tudo para encontrar o modelo, vamos predizer a artura das árvores e
## salvar num arquivo.
hpred <- predict(mfinb, newdata=dap)
str(hpred)
## Named num [1:991] 2.4 3.74 5.15 6.15 6.15 ...
## - attr(*, "names")= chr [1:991] "960" "870" "616" "261" ...
dap$hpred <- hpred
str(dap)
## 'data.frame': 991 obs. of 9 variables:
## $ d : num 1.94 2.29 2.71 3.02 3.02 ...
## $ h : num NA NA NA NA NA NA NA NA NA NA ...
## $ d2 : num 3.77 5.25 7.32 9.14 9.14 ...
## $ d3 : num 7.32 12.04 19.81 27.65 27.65 ...
## $ dr : num 1.39 1.51 1.64 1.74 1.74 ...
## $ dl : num 0.664 0.829 0.995 1.107 1.107 ...
## $ di : num 0.515 0.436 0.37 0.331 0.331 ...
## $ di2 : num 0.265 0.19 0.137 0.109 0.109 ...
## $ hpred: num 2.4 3.74 5.15 6.15 6.15 ...
## write.table(dap, "dap.xls", sep="\t", quote=FALSE, row.names=FALSE,
## dec=".")
##-----------------------------------------------------------------------------
## Predição.
## range(dapcc2$d)
d.new <- seq(4, 30, length=100)
d.new
## [1] 4.000 4.263 4.525 4.788 5.051 5.313 5.576 5.838 6.101 6.364 6.626 6.889
## [13] 7.152 7.414 7.677 7.939 8.202 8.465 8.727 8.990 9.253 9.515 9.778 10.040
## [25] 10.303 10.566 10.828 11.091 11.354 11.616 11.879 12.141 12.404 12.667 12.929 13.192
## [37] 13.455 13.717 13.980 14.242 14.505 14.768 15.030 15.293 15.556 15.818 16.081 16.343
## [49] 16.606 16.869 17.131 17.394 17.657 17.919 18.182 18.444 18.707 18.970 19.232 19.495
## [61] 19.758 20.020 20.283 20.545 20.808 21.071 21.333 21.596 21.859 22.121 22.384 22.646
## [73] 22.909 23.172 23.434 23.697 23.960 24.222 24.485 24.747 25.010 25.273 25.535 25.798
## [85] 26.061 26.323 26.586 26.848 27.111 27.374 27.636 27.899 28.162 28.424 28.687 28.949
## [97] 29.212 29.475 29.737 30.000
pred <- data.frame(d=d.new, dr=sqrt(d.new))
## Fazendo predição com intervalo de confiança e predição futura.
Yp <- predict(mfinb, newdata=pred,
interval="confidence")
Yf <- predict(mfinb, newdata=pred,
interval="prediction")
colnames(Yf) <- toupper(colnames(Yf))
head(Yp)
## fit lwr upr
## 1 8.816 7.177 10.45
## 2 9.454 7.925 10.98
## 3 10.064 8.637 11.49
## 4 10.650 9.318 11.98
## 5 11.212 9.969 12.45
## 6 11.753 10.593 12.91
pred <- cbind(pred, Yp, Yf)
str(pred)
## 'data.frame': 100 obs. of 8 variables:
## $ d : num 4 4.26 4.53 4.79 5.05 ...
## $ dr : num 2 2.06 2.13 2.19 2.25 ...
## $ fit: num 8.82 9.45 10.06 10.65 11.21 ...
## $ lwr: num 7.18 7.92 8.64 9.32 9.97 ...
## $ upr: num 10.5 11 11.5 12 12.5 ...
## $ FIT: num 8.82 9.45 10.06 10.65 11.21 ...
## $ LWR: num 5.03 5.72 6.37 6.99 7.58 ...
## $ UPR: num 12.6 13.2 13.8 14.3 14.8 ...
## Inclusão da expressão do modelo.
c0 <- coef(mfinb)
co <- format(c(abs(c0), summary(mfinb)$r.squared),
digits=3, trim=TRUE)
sinal <- ifelse(c0<0, "-", "+")
co[seq_along(sinal)] <- paste(sinal, co[seq_along(sinal)], sep="")
l <- c("Predito", "IC predito", "IC obs. futura")
key <- list(#corner=c(0.05,0.95),
lines=list(lty=1),
text=list(l[1]),
rect=list(col=c("red"), alpha=0.1, lty=3),
text=list(l[2]),
rect=list(col=c("blue"), alpha=0.1, lty=3),
text=list(l[3]))
xyplot(h~d, data=dapcc2, xlab="DAP (cm)", ylab="Altura (m)", key=key)+
as.layer(xyplot(fit~d, data=pred, type="l",
ly=pred$lwr, uy=pred$upr, cty="bands", fill="red",
prepanel=prepanel.cbH, panel=panel.cbH))+
as.layer(xyplot(FIT~d, data=pred, type="l",
ly=pred$LWR, uy=pred$UPR, cty="bands", fill="blue",
prepanel=prepanel.cbH, panel=panel.cbH))+
layer(panel.text(
x=20, y=15,
label=substitute(hat(h)==b0~b1*d~b2*sqrt(d)~~~~(R^2==r2),
list(b0=co[1], b1=co[2], b2=co[3], r2=co[4])), bty="n"))
