Aplicação de modelos de regressão linear e não linear em ciências agrárias

09 à 11 de Dezembro de 2014 - Goiânia - GO
Prof. Dr. Walmes M. Zeviani
Embrapa Arroz e Feijão
Lab. de Estatística e Geoinformação - LEG
Departamento de Estatística - UFPR


Modelagem da variância


Modelo de regressão não linear

##=============================================================================
## Aplicação de modelos de regressão linear e
##   não linear em ciências agrárias
##
##   09 à 11 de Dezembro de 2014 - Goiânia/GO
##   Embrapa Arroz e Feijão
## 
##                                                  Prof. Dr. Walmes M. Zeviani
##                                                                LEG/DEST/UFPR
##=============================================================================

##-----------------------------------------------------------------------------
## Definições da sessão.

pkg <- c("lattice", "latticeExtra", "gridExtra", "car", "alr3",
         "plyr", "reshape", "nlme", "segmented", "wzRfun")

sapply(pkg, require, character.only=TRUE)
##      lattice latticeExtra    gridExtra          car         alr3         plyr 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##      reshape         nlme    segmented       wzRfun 
##         TRUE         TRUE         TRUE         TRUE
trellis.device(color=FALSE)
##-----------------------------------------------------------------------------
## Dados de crescimento de goiaba.

goi <- read.table("http://www.leg.ufpr.br/~walmes/data/goiaba.txt",
                  header=TRUE, sep="\t")
goi$daa2 <- goi$daa^2.5
str(goi)
## 'data.frame':    174 obs. of  8 variables:
##  $ daa   : int  13 13 13 13 13 13 13 13 13 13 ...
##  $ coleta: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ rep   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ long  : num  29.1 32.9 36.3 34.7 33.3 ...
##  $ trans : num  20.5 22.8 22.9 22.7 21.4 ...
##  $ peso  : num  6.66 8.51 9.93 9.14 7.6 7.54 6.27 9.94 7.95 7.79 ...
##  $ volume: int  6 9 11 10 7 8 6 9 8 8 ...
##  $ daa2  : num  609 609 609 609 609 ...
## Calcula média e desvio padrão em cada data.
aux <- ddply(goi, .(daa), summarise, m=mean(peso), s=sd(peso))

p1 <- xyplot(peso~daa, data=goi, type=c("p","a"),
             ylab="Peso do fruto (g)", xlab="Dias após antese")
p2 <- xyplot(s~m, aux, type=c("p","r"),
             xlab="Média de peso", ylab="Desvio-padrão de peso")
grid.arrange(p1, p2, nrow=1)

## Na escala log para o peso.

aux <- ddply(goi, .(daa), summarise, m=mean(log(peso)), s=sd(log(peso)))

p1 <- xyplot(log(peso)~daa, data=goi, type=c("p","a"),
             ylab="Peso do fruto (g)", xlab="Dias após antese")
p2 <- xyplot(s~m, aux, type=c("p","r"),
             xlab="Média de peso", ylab="Desvio-padrão de peso")
grid.arrange(p1, p2, nrow=1)

##-----------------------------------------------------------------------------
## Ajustes.

## Modelo tradicional =  suposição de variância constante.
n0 <- gnls(peso~A-(A-D)*exp(-exp(C*(daa-B))), data=goi,
           start=c(A=200, C=0.09, B=105, D=7))
summary(n0)
## Generalized nonlinear least squares fit
##   Model: peso ~ A - (A - D) * exp(-exp(C * (daa - B))) 
##   Data: goi 
##        AIC      BIC    logLik
##   1619.093 1634.888 -804.5465
## 
## Coefficients:
##       Value Std.Error   t-value p-value
## A 205.76239  4.196193  49.03549       0
## C   0.09367  0.010148   9.22980       0
## B 107.35182  0.874391 122.77326       0
## D  18.70608  3.179082   5.88411       0
## 
##  Correlation: 
##   A      C      B     
## C -0.357              
## B  0.590 -0.131       
## D -0.103  0.467  0.219
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -3.17237358 -0.42834311 -0.07757784  0.35077905  3.94396837 
## 
## Residual standard error: 24.94118 
## Degrees of freedom: 174 total; 170 residual
## Modelo com função de variâcia: var(e)=sigma^2*exp(2*delta*daa).
n1 <- update(n0,
             ## weights=varExp(0.03, form=~daa)
             weights=varPower(0.1, form=~log(daa))             
             )
summary(n1)
## Generalized nonlinear least squares fit
##   Model: peso ~ A - (A - D) * exp(-exp(C * (daa - B))) 
##   Data: goi 
##        AIC      BIC    logLik
##   1432.407 1451.362 -710.2036
## 
## Variance function:
##  Structure: Power of variance covariate
##  Formula: ~log(daa) 
##  Parameter estimates:
##    power 
## 5.764348 
## 
## Coefficients:
##       Value Std.Error  t-value p-value
## A 217.15613  8.784140 24.72139       0
## C   0.05520  0.003531 15.63409       0
## B 108.05936  1.880553 57.46148       0
## D   7.49496  0.426361 17.57893       0
## 
##  Correlation: 
##   A      C      B     
## C -0.566              
## B  0.879 -0.624       
## D -0.390  0.837 -0.386
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.66220159 -0.53129856  0.09399151  0.63400182  2.80361091 
## 
## Residual standard error: 0.003829197 
## Degrees of freedom: 174 total; 170 residual
logLik(n0)
## 'log Lik.' -804.5465 (df=5)
logLik(n1)
## 'log Lik.' -710.2036 (df=6)
anova(n1, n0)
##    Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## n1     1  6 1432.407 1451.362 -710.2036                        
## n0     2  5 1619.093 1634.888 -804.5465 1 vs 2 188.6858  <.0001
##-----------------------------------------------------------------------------
## Diagnóstico.

da <- rbind(data.frame(model="trad",
                       r=residuals(n0, type="pearson"),
                       f=fitted(n0)),
            data.frame(model="var",
                       r=residuals(n1, type="pearson"),
                       f=fitted(n1)))

p1 <- qqmath(~r|model, data=da,
             xlab="Quantis teóricos",
             ylab="Resíduos padronizados")
p2 <- xyplot(sqrt(abs(r))~f|model, data=da, type=c("p","smooth"),
             xlab="Valor ajustado",
             ylab=expression(sqrt("|Resíduos padronizados|")))
grid.arrange(p1, p2)

##-----------------------------------------------------------------------------
## Predição a partir dos modelos.

model <- deriv3(~ A-(A-D)*exp(-exp(C*(daa-B))),
                c("A","C","B","D"),
                function(daa, A, C, B, D){NULL})

l <- list(n0, n1)
names(l) <- levels(da$model)
L <- l
for(i in seq_along(L)){
    m4 <- l[[i]]
    U <- chol(vcov(m4))
    pred <- data.frame(daa=seq(1,140,1))
    m <- model(daa=pred$daa, A=coef(m4)["A"], C=coef(m4)["C"],
               B=coef(m4)["B"], D=coef(m4)["D"])
    F0 <- attr(m, "gradient")
    pred$y <- c(m)
    pred$se <- sqrt(apply(F0%*%t(U), 1, function(x) sum(x^2)))
    z <- qnorm(0.975)
    pred <- transform(pred, lwr=y-z*se, upr=y+z*se)
    L[[i]] <- pred
}

str(L)
## List of 2
##  $ trad:'data.frame':    140 obs. of  5 variables:
##   ..$ daa: num [1:140] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ y  : num [1:140] 18.7 18.7 18.7 18.7 18.7 ...
##   ..$ se : num [1:140] 3.17 3.17 3.17 3.17 3.17 ...
##   ..$ lwr: num [1:140] 12.5 12.5 12.5 12.5 12.5 ...
##   ..$ upr: num [1:140] 24.9 24.9 24.9 24.9 24.9 ...
##  $ var :'data.frame':    140 obs. of  5 variables:
##   ..$ daa: num [1:140] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ y  : num [1:140] 8.06 8.1 8.13 8.16 8.2 ...
##   ..$ se : num [1:140] 0.278 0.273 0.267 0.262 0.256 ...
##   ..$ lwr: num [1:140] 7.52 7.56 7.61 7.65 7.7 ...
##   ..$ upr: num [1:140] 8.61 8.63 8.65 8.68 8.7 ...
L <- ldply(L)
names(L)[1] <- "model"
L$model <- factor(L$model, levels=levels(da$model))
str(L)
## 'data.frame':    280 obs. of  6 variables:
##  $ model: Factor w/ 2 levels "trad","var": 1 1 1 1 1 1 1 1 1 1 ...
##  $ daa  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ y    : num  18.7 18.7 18.7 18.7 18.7 ...
##  $ se   : num  3.17 3.17 3.17 3.17 3.17 ...
##  $ lwr  : num  12.5 12.5 12.5 12.5 12.5 ...
##  $ upr  : num  24.9 24.9 24.9 24.9 24.9 ...
icm <- ddply(goi, .(daa), summarise,
             m=mean(peso),
             lwr=mean(peso)-qt(0.975, length(peso)-1)*sd(peso)*
             sqrt(1/length(peso)),
             upr=mean(peso)+qt(0.975, length(peso)-1)*sd(peso)*
             sqrt(1/length(peso)))

##-----------------------------------------------------------------------------
## Gráfico com as bandas de confiança.

ylim <- range(goi$peso)
xyplot(y~daa|model, data=L, type="l", col=1,
       ylab="Massa fresca dos frutos (g)",
       xlab="Dias após a antese",
       strip=strip.custom(bg="gray90"),
       ly=L$lwr, uy=L$upr, cty="bands",
       prepanel=prepanel.cbH, panel=panel.cbH)+
           layer(panel.arrows(icm$daa, icm$lwr, icm$daa, icm$upr,
                              code=3, length=0.05, angle=90))


Modelo segmentado para variável transformada

##-----------------------------------------------------------------------------
## Ajuste na escala log da resposta.

## log base 10 da resposta.
goi$lpeso <- log10(goi$peso)

## Semanas após antese.
goi$saa <- goi$daa/7

xyplot(log10(peso)~saa, data=goi, type=c("p","a"),
       ylab="log 10 do peso do fruto (g)", xlab="Dias após antese")

##-----------------------------------------------------------------------------
## Ajustar modelo linear segmentado.

## Modelo linear simples.
m0 <- lm(lpeso~saa, data=goi)

## 1 ponto de quebra.
s1 <- segmented(m0, seg.Z=~saa,
                psi=list(saa=c(17)),
                control=seg.control(display=FALSE))
summary(s1)
## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = m0, seg.Z = ~saa, psi = list(saa = c(17)), 
##     control = seg.control(display = FALSE))
## 
## Estimated Break-Point(s):
##     Est.  St.Err 
## 17.7200  0.6508 
## 
## t value for the gap-variable(s) V:  0 
## 
## Meaningful coefficients of the linear terms:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.693875   0.018733  37.040   <2e-16 ***
## saa          0.090500   0.001556  58.172   <2e-16 ***
## U1.saa      -0.096918   0.042570  -2.277       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1009 on 170 degrees of freedom
## Multiple R-Squared: 0.9607,  Adjusted R-squared:  0.96 
## 
## Convergence attained in 15 iterations with relative change 0.0008556697
confint(s1)
## $saa
##   Est. CI(95%).l CI(95%).u
##  17.72     16.44     19.01
class(s1) <- c("segmented","lm")
plot(lpeso~saa, data=goi)
plot(s1, add=TRUE, col=2)

class(s1) <- c("lm","segmented")
par(mfrow=c(2,2)); plot(s1); layout(1)

pureErrorAnova(s1)
## Analysis of Variance Table
## 
## Response: lpeso
##               Df Sum Sq Mean Sq   F value    Pr(>F)    
## saa            1 41.997  41.997 6004.8198 < 2.2e-16 ***
## U1.saa         1  0.319   0.319   45.6508 2.417e-10 ***
## psi1.saa       1  0.002   0.002    0.3227    0.5708    
## Residuals    170  1.731   0.010                        
##  Lack of fit   8  0.598   0.075   10.6813 5.276e-12 ***
##  Pure Error  162  1.133   0.007                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 pontos de quebra.
s2 <- update(s1, psi=list(saa=c(10,17)))
summary(s2)
## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = m0, seg.Z = ~saa, psi = list(saa = c(10, 17)), 
##     control = seg.control(display = FALSE))
## 
## Estimated Break-Point(s):
##            Est. St.Err
## psi1.saa 12.96 0.6145
## psi2.saa 16.78 0.2425
## 
## t value for the gap-variable(s) V:  0 0 
## 
## Meaningful coefficients of the linear terms:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.790262   0.019286  40.976   <2e-16 ***
## saa          0.073870   0.002382  31.010   <2e-16 ***
## U1.saa       0.075574   0.015347   4.924       NA    
## U2.saa      -0.161626   0.022992  -7.030       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08304 on 168 degrees of freedom
## Multiple R-Squared: 0.9737,  Adjusted R-squared: 0.9729 
## 
## Convergence attained in 2 iterations with relative change -1.915515e-16
slope(s2)
## $saa
##            Est.  St.Err. t value CI(95%).l CI(95%).u
## slope1  0.07387 0.002382 31.0100   0.06917   0.07857
## slope2  0.14940 0.015160  9.8570   0.11950   0.17940
## slope3 -0.01218 0.017290 -0.7048  -0.04631   0.02194
confint(s2)
## $saa
##           Est. CI(95%).l CI(95%).u
## psi1.saa 12.96     11.75     14.18
## psi2.saa 16.78     16.30     17.26
## class(s1) <- c("segmented","lm")
plot(lpeso~saa, data=goi)
plot(s2, add=TRUE, col=2)

class(s2) <- c("lm","segmented")
par(mfrow=c(2,2)); plot(s2); layout(1)

pureErrorAnova(s2)
## Analysis of Variance Table
## 
## Response: lpeso
##               Df Sum Sq Mean Sq   F value  Pr(>F)    
## saa            1 41.997  41.997 6004.8198 < 2e-16 ***
## U1.saa         1  0.021   0.021    2.9787 0.08627 .  
## U2.saa         1  0.871   0.871  124.4906 < 2e-16 ***
## psi1.saa       1  0.000   0.000    0.0289 0.86519    
## psi2.saa       1  0.000   0.000    0.0023 0.96175    
## Residuals    168  1.158   0.007                      
##  Lack of fit   6  0.025   0.004    0.6053 0.72580    
##  Pure Error  162  1.133   0.007                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Requer um ponto de quebra a mais?
anova(s1, s2)
## Analysis of Variance Table
## 
## Model 1: lpeso ~ saa + U1.saa + psi1.saa
## Model 2: lpeso ~ saa + U1.saa + U2.saa + psi1.saa + psi2.saa
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    170 1.7306                                  
## 2    168 1.1584  2   0.57223 41.494 2.265e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##-----------------------------------------------------------------------------
## Predição.

## er <- extendrange(goi$saa, f=0.1)
## pred <- data.frame(saa=seq(er[1], er[2], l=50))
## pred <- cbind(pred, predict(s2, newdata=pred, interval="confidence"))

## Declarar o modelo com a nls para fazer bandas de predição.
slope(s2)
## $saa
##            Est.  St.Err. t value CI(95%).l CI(95%).u
## slope1  0.07387 0.002382 31.0100   0.06917   0.07857
## slope2  0.14940 0.015160  9.8570   0.11950   0.17940
## slope3 -0.01218 0.017290 -0.7048  -0.04631   0.02194
str(goi)
## 'data.frame':    174 obs. of  10 variables:
##  $ daa   : int  13 13 13 13 13 13 13 13 13 13 ...
##  $ coleta: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ rep   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ long  : num  29.1 32.9 36.3 34.7 33.3 ...
##  $ trans : num  20.5 22.8 22.9 22.7 21.4 ...
##  $ peso  : num  6.66 8.51 9.93 9.14 7.6 7.54 6.27 9.94 7.95 7.79 ...
##  $ volume: int  6 9 11 10 7 8 6 9 8 8 ...
##  $ daa2  : num  609 609 609 609 609 ...
##  $ lpeso : num  0.823 0.93 0.997 0.961 0.881 ...
##  $ saa   : num  1.86 1.86 1.86 1.86 1.86 ...
model <- lpeso~b0+b1*saa+b2*(saa-xb1)*(saa>xb1)+b3*(saa-xb2)*(saa>xb2)
start <- list(b0=c(0.9, 0.5, 1.2), b1=c(0.07, 0.05, 0.2),
              b2=c(0.1, -0.2, 0.2), b3=c(0.1, -0.2, 0.2),
              xb1=c(10, 5, 15), xb2=c(17, 15, 20))

## rp.nls(model=model, start=start, data=goi)
## summary(rp.fit)
## dput(as.list(round(coef(rp.fit),3)))

start <- list(b0 = 0.793, b1 = 0.073, b2 = 0.067, xb1 = 12.496, 
    b3 = -0.154, xb2 = 16.856)
n0 <- nls(model, data=goi, start=start)

##-----------------------------------------------------------------------------
## Predição.

f <- function(th, saa){
    y <- th["b0"]+th["b1"]*saa+
        th["b2"]*(saa-th["xb1"])*(saa>th["xb1"])+
            th["b3"]*(saa-th["xb2"])*(saa>th["xb2"])
    return(y)
}

er <- extendrange(goi$saa, f=0.1)
pred <- data.frame(saa=seq(er[1], er[2], l=100))
pred$y <- f(coef(n0), saa=pred$saa)

F <- rootSolve::gradient(f=f, x=coef(n0),
                         saa=pred$saa)
U <- chol(vcov(n0))
pred$se <- sqrt(apply(F%*%t(U), 1, function(x) sum(x^2)))
aux <- outer(pred$se, qnorm(0.975)*c(lwr=-1, fit=0, upr=1), FUN="*")
pred <- cbind(pred, sweep(aux, MARGIN=1, STATS=pred$y, FUN="+"))
str(pred)
## 'data.frame':    100 obs. of  6 variables:
##  $ saa: num  0.0714 0.2879 0.5043 0.7208 0.9372 ...
##  $ y  : num  0.799 0.814 0.83 0.846 0.862 ...
##  $ se : num  0.0214 0.0208 0.0202 0.0196 0.019 ...
##  $ lwr: num  0.757 0.774 0.791 0.808 0.825 ...
##  $ fit: num  0.799 0.814 0.83 0.846 0.862 ...
##  $ upr: num  0.84 0.855 0.87 0.884 0.899 ...
p1 <- xyplot(log10(peso)~saa, data=goi,
             ylab="log 10 do peso do fruto (g)",
             xlab="Dias após antese")
p2 <- xyplot(fit~saa, data=pred, type="l",
             ly=pred$lwr, uy=pred$upr, cty="bands",
             prepanel=prepanel.cbH, panel=panel.cbH)

p1+as.layer(p2, under=TRUE)+
    layer(panel.abline(v=coef(n0)[c("xb1","xb2")], lty=2))


##-----------------------------------------------------------------------------
## Informações da sessão.

Sys.time()
## [1] "2014-12-11 21:06:13 BRST"
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: i686-pc-linux-gnu (32-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=pt_BR.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  base     
## 
## other attached packages:
##  [1] wzRfun_0.4          segmented_0.5-0.0   nlme_3.1-118        reshape_0.8.5      
##  [5] plyr_1.8.1          alr3_2.0.5          car_2.0-22          gridExtra_0.9.1    
##  [9] latticeExtra_0.6-26 RColorBrewer_1.0-5  lattice_0.20-29     rmarkdown_0.3.3    
## [13] knitr_1.8          
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.4      doBy_4.5-12       evaluate_0.5.5    formatR_1.0      
##  [5] htmltools_0.2.6   MASS_7.3-35       Matrix_1.1-4      methods_3.1.2    
##  [9] multcomp_1.3-7    mvtnorm_1.0-1     nnet_7.3-8        Rcpp_0.11.3      
## [13] rootSolve_1.6.5.1 sandwich_2.3-2    splines_3.1.2     stringr_0.6.2    
## [17] survival_2.37-7   TH.data_1.0-5     tools_3.1.2       yaml_2.1.13      
## [21] zoo_1.7-11