# 1 Exemplo com correlação

Neste exemplo, será usado Jackknife para fazer inferência sobre o coeficiente de correlação de Pearson.

# Fertility (y) vs. percentage of males employed in agriculture (x).
plot(Fertility ~ Agriculture, data = swiss)

# Pearson's correlation coefficient has a parametric inference method,
# but it depends on its assumptions (normality, etc.) being met.
with(swiss, cor.test(x = Fertility, y = Agriculture))
##
##  Pearson's product-moment correlation
##
## data:  Fertility and Agriculture
## t = 2.5316, df = 45, p-value = 0.01492
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07334947 0.58130587
## sample estimates:
##       cor
## 0.3530792

# Estimate using all observations.
rho <- with(swiss, cor(x = Fertility, y = Agriculture))
rho
## [1] 0.3530792

n <- nrow(swiss)  # Number of observations.
i <- seq_len(n)   # Indices (seq_len() is safe even if n were 0).

# Partial (leave-one-out) estimates. vapply() guarantees a numeric
# vector of length n, unlike sapply(), whose return type depends on
# its input.
pe <- vapply(i,
             FUN = function(ii) {
                 with(swiss[-ii, ],
                      cor(x = Fertility, y = Agriculture))
             },
             FUN.VALUE = numeric(1))
sort(pe)
##  [1] 0.2527003 0.3130895 0.3236550 0.3252307 0.3295657 0.3373877 0.3384854
##  [8] 0.3394874 0.3431902 0.3438933 0.3463836 0.3472831 0.3473921 0.3484359
## [15] 0.3515465 0.3525519 0.3526445 0.3529275 0.3532247 0.3544081 0.3544769
## [22] 0.3547668 0.3548660 0.3556879 0.3573136 0.3575933 0.3580389 0.3582101
## [29] 0.3584283 0.3598261 0.3600821 0.3603786 0.3607963 0.3622056 0.3629919
## [36] 0.3630053 0.3633260 0.3634777 0.3639524 0.3646809 0.3665158 0.3690244
## [43] 0.3693343 0.3700471 0.3787130 0.3872608 0.3920289

# Pseudo values.
pv <- n * rho - (n - 1) * pe
sort(pv)
##  [1] -1.438609208 -1.219276002 -0.826074886 -0.427443555 -0.394654825
##  [6] -0.380399765 -0.265005713 -0.180598644 -0.147089195 -0.125250512
## [11] -0.118273658 -0.103522641 -0.102906932 -0.066738055 -0.001907453
## [16]  0.017305935  0.030944693  0.042721510  0.107019784  0.117054764
## [21]  0.124930800  0.145427596  0.158296835  0.233079893  0.270887611
## [26]  0.275450148  0.288782306  0.291947068  0.346384176  0.360058426
## [31]  0.373075948  0.377332330  0.423580770  0.566671387  0.614684885
## [36]  0.619697687  0.661076544  0.775631850  0.807971775  0.978302879
## [41]  1.024391458  1.074886874  1.434699932  1.634111551  1.706590742
## [46]  2.192606635  4.970505848

# WARNING: some pseudo values fall outside the parameter space of the
# correlation, which is [-1, 1].

# Jackknife point estimate.
mean(pv)
## [1] 0.3669864

# Jackknife standard error.
sqrt(var(pv)/n)
## [1] 0.1407584

# Jackknife confidence interval (assumes independence and normality).
mean(pv) + c(-1, 1) * qt(p = 0.975, df = n - 1) * sqrt(var(pv)/n)
## [1] 0.0836545 0.6503182
# Confidence interval for the Pearson correlation.
with(swiss,
     cor.test(x = Fertility, y = Agriculture)$conf.int)
## [1] 0.07334947 0.58130587
## attr(,"conf.level")
## [1] 0.95

Em qual dos dois tipos de intervalo de confiança é mais seguro (para não
usar o termo confiável)? O que é ser mais seguro?

# 2 Exemplo com regressão não linear

library(tidyverse)

# Load the data.
data(segreg, package = "alr3")

# Visualize the data.
ggplot(segreg, aes(x = Temp, y = C)) +
    geom_point() +
    geom_smooth(se = FALSE, span = 0.4)
## geom_smooth() using method = 'loess'

# Find starting values for the segmented (broken-stick) model.
start <- list(th0 = 75, th1 = 0.5, th2 = 50)
ggplot(segreg, aes(x = Temp, y = C)) +
    geom_point() +
    geom_smooth(se = FALSE, span = 0.4)
## geom_smooth() using method = 'loess'

plot(C ~ Temp, data = segreg)
with(start, {
    curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2),
          add = TRUE)
})

# Fit the model to the data.
n0 <- nls(C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) + 0 * (Temp < th2),
          data = segreg,
          start = start)
summary(n0)
##
## Formula: C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) + 0 * (Temp < th2)
##
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)
## th0  74.6953     1.3433  55.607  < 2e-16 ***
## th1   0.5674     0.1006   5.641 2.10e-06 ***
## th2  41.9512     4.6583   9.006 9.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.373 on 36 degrees of freedom
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 1.141e-08

confint.default(n0)
##          2.5 %     97.5 %
## th0 72.0625625 77.3281125
## th1  0.3702916  0.7645672
## th2 32.8212184 51.0812375

# Fitted curve overlaid on the data.
plot(C ~ Temp, data = segreg)
with(as.list(coef(n0)), {
    curve(th0 + th1 * (x - th2) * (x >= th2) + 0 * (x < th2),
          add = TRUE)
})

#-----------------------------------------------------------------------
# Applying the Jackknife method.

# Pseudo values (one 3-vector of coefficients per left-out observation,
# so sapply() returns a 3 x n matrix with rows th0, th1, th2).
n <- nrow(segreg)
i <- 1:n
pv <- sapply(i,
             FUN = function(ii) {
                 # Refit the model without observation ii.
                 n1 <- update(n0, data = segreg[-ii, ], start = coef(n0))
                 # Compute the pseudo values.
                 n * coef(n0) - (n - 1) * coef(n1)
             })
pv
##  [,1] [,2] [,3] [,4] [,5] [,6]
## th0 103.8767098 68.8478030 73.1790431 58.3058430 68.7659764 79.2263631
## th1 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297
## th2 93.3785551 31.6459300 39.2790205 13.0674780 31.5017241 49.9364168
##  [,7] [,8] [,9] [,10] [,11] [,12]
## th0 97.3624965 73.4240164 65.9876697 83.5912964 85.7694564 65.5790430
## th1 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297 0.5674297
## th2 81.8983370 39.7107453 26.6054205 57.6288862 61.4675311 25.8852839
##  [,13] [,14] [,15] [,16] [,17] [,18]
## th0 79.6349897 74.0778697 18.783115 87.7115098 74.695339 74.6953392
## th1 0.5674297 0.5674297 -0.838256 0.5674297 -0.512521 0.4687384
## th2 50.6565534 40.8630531 -112.931446 64.8900773 -6.461628 37.2292207
##  [,19] [,20] [,21] [,22] [,23] [,24]
## th0 74.6953385 74.6953377 74.6953379 74.695338 74.6953395 74.6953390
## th1 0.9128637 0.4778908 0.1261413 1.433339 0.6711386 0.6726583
## th2 59.3790948 37.5142396 20.3100356 87.825906 47.3775618 51.1900718
##  [,25] [,26] [,27] [,28] [,29] [,30]
## th0 74.6953391 74.6953385 74.6953381 74.6953373 74.6953376 74.69534
## th1 0.7105989 0.6166268 0.5711587 0.6031183 0.9077715 0.61858
## th2 59.8015242 76.2712899 39.0170175 32.4853470 26.0920991 40.71884
##  [,31] [,32] [,33] [,34] [,35] [,36]
## th0 74.6953381 74.6953386 74.6953381 74.6953384 74.695339 74.6953381
## th1 0.7246327 0.6716795 0.5243044 0.8085346 1.483175 0.5039746
## th2 42.2766428 43.0727384 41.4490078 45.3822387 56.480970 40.8791466
##  [,37] [,38] [,39]
## th0 74.695338 74.6953379 74.6953382
## th1 1.137357 -0.4122876 -0.8823282
## th2 52.248344 23.3200786 12.9017403

# Jackknife point estimates and standard errors.
jkest <- cbind("JK_Estimate" = apply(pv, MARGIN = 1, FUN = mean),
               "JK_StdError" = apply(pv, MARGIN = 1, FUN = sd)/sqrt(n))
jkest
##     JK_Estimate JK_StdError
## th0   74.413230  1.90986928
## th1    0.525906  0.07637956
## th2   40.057567  5.17480685

# Computing the CI.
me <- outer(X = c(lwr = -1, upr = 1) * qt(0.975, df = n - length(coef(n0))),
            Y = jkest[, 2],
            FUN = "*")
t(sweep(me, MARGIN = 2, STATS = jkest[, 1], FUN = "+"))
##           lwr        upr
## th0 70.539836 78.2866247
## th1  0.371001  0.6808109
## th2 29.562572 50.5525613

#-----------------------------------------------------------------------
# Using the nlstools::nlsJack() function.

library(nlstools)
##
## 'nlstools' has been loaded.
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'

jk <- nlsJack(n0)
summary(jk)
##
## ------
## Jackknife statistics
##     Estimates       Bias
## th0 74.413230 0.28210720
## th1  0.525906 0.04152343
## th2 40.057567 1.89366134
##
## ------
## Jackknife confidence intervals
##           Low         Up
## th0 70.539836 78.2866247
## th1  0.371001  0.6808109
## th2 29.562572 50.5525613
##
## ------
## Influential values
## * Observation 1 is influential on th0
## * Observation 4 is influential on th0
## * Observation 7 is influential on th0
## * Observation 15 is influential on th0
## * Observation 15 is influential on th1
## * Observation 39 is influential on th1
## * Observation 15 is influential on th2

# Plot observation numbers to locate the influential points.
ggplot(segreg, aes(x = Temp, y = C)) +
    # geom_point() +
    geom_text(label = 1:nrow(segreg))

# 3 Exemplo com regressão polinomial

# Load the data from the web.
url <- "https://web.stanford.edu/~hastie/CASI_files/DATA/kidney.txt"
kidney <- read.table(url, header = TRUE, sep = " ")
str(kidney)
## 'data.frame': 157 obs. of  2 variables:
##  $ age: int  18 19 19 20 21 21 21 22 22 22 ...
##  $ tot: num  2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ...
# Visualize the data (kidney function score vs. age).
plot(tot ~ age, data = kidney)

# Fit a degree-5 polynomial model; poly() uses orthogonal polynomials,
# so the coefficient t-tests for each degree are mutually independent.
m0 <- lm(tot ~ poly(age, degree = 5), data = kidney)
summary(m0)
##
## Call:
## lm(formula = tot ~ poly(age, degree = 5), data = kidney)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.1758 -1.3138  0.0441  1.0551  4.5577
##
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)            -1.911e-04  1.454e-01  -0.001    0.999
## poly(age, degree = 5)1 -1.563e+01  1.822e+00  -8.576 1.09e-14 ***
## poly(age, degree = 5)2 -4.758e-01  1.822e+00  -0.261    0.794
## poly(age, degree = 5)3 -1.082e-01  1.822e+00  -0.059    0.953
## poly(age, degree = 5)4 -4.028e-01  1.822e+00  -0.221    0.825
## poly(age, degree = 5)5 -9.224e-01  1.822e+00  -0.506    0.614
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.822 on 151 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.3064
## F-statistic: 14.79 on 5 and 151 DF,  p-value: 8.509e-12
# Check model assumptions with the default lm diagnostic plots.
# NOTE(review): par() is modified without saving/restoring the old
# settings; later plots in this session will keep the 2x2 layout.
par(mfrow = c(2, 2))
plot(m0)