Testes de aleatorização

Uma senhora toma chá

#-----------------------------------------------------------------------
# Uma senhora toma chá.

# Quantidade de maneiras de gerar dois grupos de 4 xícaras usando 8.
n_poss <- choose(8, 4)
n_poss

## [1] 70

# De 4 xícaras selecionadas para um grupo, X representa o número de
# acertos.
n_acertar <- sapply(4:0,
                    FUN = function(i) {
                        # (formas de acertar x em 4) * (formas de errar x em 4).
                        choose(4, i) * choose(4, 4 - i)
                    })
n_acertar

## [1]  1 16 36 16  1

# Pr(Acerto total) = 1/70 < 5%.
cumsum(n_acertar)/n_poss

## [1] 0.01428571 0.24285714 0.75714286 0.98571429 1.00000000

# Comprovando por simulação.
v <- rep(0:1, each = 4)
mean(replicate(n = 100000,
               expr = all(sample(v) == v)))

## [1] 0.01473

# Essa forma de fazer a conta é mais realista mas dá na mesma.
mean(replicate(n = 100000,
               expr = all(sample(v) == sample(v))))

## [1] 0.01409

Teste para a diferença de médias

#-----------------------------------------------------------------------
# Exemplo com teste para a diferença de médias.

# Comprimentos de crânios de cães pré-históricos.
m <- c(120, 107, 110, 116, 114, 111, 113, 117, 114, 112)
f <- c(110, 111, 107, 108, 110, 105, 107, 106, 111, 111)

plot(ecdf(m), xlim = range(c(m, f)), col = "cyan")
lines(ecdf(f), col = "magenta")
rug(m, col = "cyan")
rug(f, col = "magenta")

# Diferença de média observada.
d <- mean(m) - mean(f)
d

## [1] 4.8

# Todos as combinações possíveis.
choose(n = 20, k = 10)

## [1] 184756

#--------------------------------------------
# Com todas as combinações possíveis (exaustão).

# Para construir todas as combinações possíveis.
k <- combn(x = 1:20, m = 10)
dim(k)

## [1]     10 184756

# Vetor com os valores dos dois grupos concatenados.
mf <- c(m, f)

# Vetor cheio de zeros.
g <- integer(20)

# Calcula a diferença para todo arranjo possível.
D <- apply(k,
           MARGIN = 2,
           FUN = function(i) {
               # Alguns elementos do vetor convertidos para 1.
               g[i] <- 1L
               # Cálculo da estatística de teste.
               -diff(tapply(mf, g, FUN = mean))
           })

# Histograma da distribuição da estatística sib H_0.
hist(D, col = "gray50")
rug(D)
abline(v = d, col = 2)

plot(ecdf(D), cex = 0)
rug(D)
abline(v = d, col = 2)

# P-valor do teste.
2 * sum(D >= d)/length(D)

## [1] 0.003334127

#--------------------------------------------
# Com simulação (não necessáriamente exaustivo).

# Variáveis que indentifica os grupos.
g <- rep(1:2, each = 10)

# Apenas para conferir.
cbind(g, mf)

##       g  mf
##  [1,] 1 120
##  [2,] 1 107
##  [3,] 1 110
##  [4,] 1 116
##  [5,] 1 114
##  [6,] 1 111
##  [7,] 1 113
##  [8,] 1 117
##  [9,] 1 114
## [10,] 1 112
## [11,] 2 110
## [12,] 2 111
## [13,] 2 107
## [14,] 2 108
## [15,] 2 110
## [16,] 2 105
## [17,] 2 107
## [18,] 2 106
## [19,] 2 111
## [20,] 2 111

# Replicando a diferença para grupos formados por aleatorização.
D <- replicate(9999, {
    gg <- sample(g)
    -diff(tapply(mf, gg, FUN = mean))
})

# Tem que juntar a estatística observada com as simuladas.
D <- c(D, d)

hist(D, col = "gray50")
rug(D)
abline(v = d, col = 2)

plot(ecdf(D), cex = 0)
rug(D)
abline(v = d, col = 2)

# P-valor do teste.
2 * sum(D >= d)/length(D)

## [1] 0.003

Teste para a correlação

#-----------------------------------------------------------------------
# Teste de aleatorização para a correlação.

# N = 5 para um par de medidas.
x <- c(4.1, 8.3, 2.9, 10.8, 9.5)
y <- c(3.7, 5.1, 1.0, 7.7, 8.9)

cbind(x, y)

##         x   y
## [1,]  4.1 3.7
## [2,]  8.3 5.1
## [3,]  2.9 1.0
## [4,] 10.8 7.7
## [5,]  9.5 8.9

plot(x, y)

# Estatística de teste na amostra original.
r0 <- cor(x, y, method = "pearson")
r0

## [1] 0.9228669

# Todas as permutações possiveis: 5! = 120 do vetor x.
X <- gtools::permutations(n = length(x), r = length(x), v = x)
str(X)

##  num [1:120, 1:5] 2.9 2.9 2.9 2.9 2.9 2.9 2.9 2.9 2.9 2.9 ...

head(X)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]  2.9  4.1  8.3  9.5 10.8
## [2,]  2.9  4.1  8.3 10.8  9.5
## [3,]  2.9  4.1  9.5  8.3 10.8
## [4,]  2.9  4.1  9.5 10.8  8.3
## [5,]  2.9  4.1 10.8  8.3  9.5
## [6,]  2.9  4.1 10.8  9.5  8.3

# As 120 correlações obtidas para cada arranjo.
r <- apply(X, MARGIN = 1, FUN = cor, y = y, method = "spearman")

# P-valor do teste.
2 * sum(r >= r0)/length(r)

## [1] 0.01666667

Índice de Moran

#-----------------------------------------------------------------------
# Índice de Moran para medir dependência espacial.

# Coordenadas dos eventos em uma malha regular.
x <- 1:8
y <- 1:8

# Para criar a matriz de pesos.
ind <- expand.grid(i = 1:length(x), j = 1:length(y))
f <- function(i, j) {
    u <- min(3, sum(abs(ind[i, ] - ind[j, ])))
    c(0, 1, sqrt(1/2), 0)[u + 1]
}
w <- matrix(0, nrow(ind), nrow(ind))
for (i in 1:nrow(ind)) {
    for (j in 1:nrow(ind)) {
        w[i, j] <- f(i, j)
    }
}
w <- w/sum(w)

image(w)

# Índice de Moran.
moran <- function(x, weights) {
    # Tamanho da amostra.
    n <- length(x)
    # Valores normalizados.
    z <- as.vector((x - mean(x))/sd(x))
    # Índice de Moran.
    as.vector(z %*% weights %*% (z * sqrt(n/(n - 1))))
}

# Teste de permutação com gráfico.
ppt <- function(z, w, N = 10000, stat, ...) {
    # Índice de Moran por reamostragem.
    sim <- replicate(N,
                     moran(sample(z), w))
    # Determina o p-valor.
    p.value <- mean((all <- c(stat, sim)) >= stat)
    # Histograma da distribuição empírica sob H_0.
    hist(sim,
         sub = paste("p =", round(p.value, 4)),
         xlim = range(all),
         ...)
    abline(v = stat, col = "#903030", lty = 3, lwd = 2)
    return(p.value)
}

# Observações simuladas.
set.seed(17)
par(mfrow = c(2, 3))

# Dados com dependência espacial ---------------------------------------
# Indução de autocorrelação por meio de uma tendência.
z <- matrix(rexp(length(x) * length(y),
                 outer(x, y^2)),
            length(x))
image(log(z), main = "Com dependência")

# Índice de Moran com dados originais.
stat <- moran(z, w)
stat

## [1] 0.06499871

hist(z)
ppt(z, w, stat = stat, main = "I de Moran", xlab = "I")

## [1] 0.01459854

# Dados sem dependência espacial ---------------------------------------
# Geração de de um conjunto de dados sob hipótese nula.
z <- matrix(rnorm(length(x) * length(y), 0, 1/2), length(x))
image(z, main = "Sem dependência")

# Índice de Moran com dados originais.
stat <- moran(z, w)
stat

## [1] -0.0121901

hist(z)
ppt(z, w, stat = stat, main = "I de Moran", xlab = "I")

## [1] 0.4478552

Jackknife

Regressão não linear

#-----------------------------------------------------------------------
# Aplicação de Jackknife em modelos de regressão não linear.

library(latticeExtra)
library(car)
library(alr3)

# Curva "palpite".
start <- list(th0 = 75, th1 = 0.5, th2 = 50)
xyplot(C ~ Temp, data = segreg) +
    layer(panel.curve(th0 + th1 * (x - th2) * (x >= th2) +
                      0 * (x < th2), lty = 2),
          data = start)

# Ajuste.
n0 <- nls(C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) +
              0 * (Temp < th2),
          data = segreg,
          start = start)

# Estimativas e medidas de ajuste.
summary(n0)

## 
## Formula: C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) + 0 * (Temp < th2)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## th0  74.6953     1.3433  55.607  < 2e-16 ***
## th1   0.5674     0.1006   5.641 2.10e-06 ***
## th2  41.9512     4.6583   9.006 9.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.373 on 36 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 1.141e-08

# Observados e preditos.
xyplot(C ~ Temp, data = segreg) +
    layer(panel.curve(th0 + th1 * (x - th2) * (x >= th2) +
                      0 * (x < th2)),
          data = as.list(coef(n0)))

#-----------------------------------------------------------------------
# Estimativas Jackknife para os parâmetros.


theta <- coef(n0) # Estimativas com todos os dados.
n <- nrow(segreg) # Número de observações.

# Matriz vazia para armazenar os pseudo-valores.
thetaj <- matrix(0, nrow = n, ncol = length(theta))
colnames(thetaj) <- names(theta)

# Ajustes deixando uma observação de fora (leave-one-out).
for (i in 1:n) {
    # Reajusta o modelo, i.e. estima com leave-one-out.
    n1 <- nls(C ~ th0 + th1 * (Temp - th2) * (Temp >= th2) +
                  0 * (Temp < th2),
              data = segreg[-i, ],
              start = coef(n0))
    # Calcula os pseudo valores.
    thetaj[i, ] <- n * theta - (n - 1) * coef(n1)
}

# Os primeiros pseudo valores.
head(thetaj)

##            th0       th1      th2
## [1,] 103.87671 0.5674297 93.37856
## [2,]  68.84780 0.5674297 31.64593
## [3,]  73.17904 0.5674297 39.27902
## [4,]  58.30584 0.5674297 13.06748
## [5,]  68.76598 0.5674297 31.50172
## [6,]  79.22636 0.5674297 49.93642

# Estimativas pontuais.
jk <- colMeans(thetaj)
cbind(MLE = theta, Jack = jk, Vício = theta - jk)

##            MLE      Jack      Vício
## th0 74.6953375 74.413230 0.28210720
## th1  0.5674294  0.525906 0.04152343
## th2 41.9512279 40.057567 1.89366134

# Erros padrões.
cbind(MLE = summary(n0)$coefficients[, "Std. Error"],
      Jack = apply(thetaj, MARGIN = 2, sd)/sqrt(n))

##           MLE       Jack
## th0 1.3432772 1.90986928
## th1 0.1005824 0.07637956
## th2 4.6582537 5.17480685

# Pacote com função pronta para Jackknife para modelos não lineares.
library(nlstools)
# ls("package:nlstools")

# Aplica Jackknife sobre o modelo.
j0 <- nlsJack(n0)
summary(j0)

## 
## ------
## Jackknife statistics
##     Estimates       Bias
## th0 74.413230 0.28210720
## th1  0.525906 0.04152343
## th2 40.057567 1.89366134
## 
## ------
## Jackknife confidence intervals
##           Low         Up
## th0 70.539836 78.2866247
## th1  0.371001  0.6808109
## th2 29.562572 50.5525613
## 
## ------
## Influential values
## * Observation 1 is influential on th0 
## * Observation 4 is influential on th0 
## * Observation 7 is influential on th0 
## * Observation 15 is influential on th0 
## * Observation 15 is influential on th1 
## * Observation 39 is influential on th1 
## * Observation 15 is influential on th2

Dose letal para 50% da população

#-----------------------------------------------------------------------
# Inferência para a DL 50.

# library(labestData)
# data(PaulaTb3.12)
# str(PaulaTb3.12)
# help(PaulaTb3.12, h = "html")
# # Converte a resposta para binário.
# PaulaTb3.12$resp <- as.integer(PaulaTb3.12$resp) - 1
# dput(PaulaTb3.12)

PaulaTb3.12 <-
structure(list(vol = c(3.7, 3.5, 1.25, 0.75, 0.8, 0.7, 0.6, 1.1,
0.9, 0.9, 0.8, 0.55, 0.6, 1.4, 0.75, 2.3, 3.2, 0.85, 1.7, 1.8,
0.4, 0.95, 1.35, 1.5, 1.6, 0.6, 1.8, 0.95, 1.9, 1.6, 2.7, 2.35,
1.1, 1.1, 1.2, 0.8, 0.95, 0.75, 1.3), razao = c(0.825, 1.09,
2.5, 1.5, 3.2, 3.5, 0.75, 1.7, 0.75, 0.45, 0.57, 2.75, 3, 2.33,
3.75, 1.64, 1.6, 1.415, 1.06, 1.8, 2, 1.36, 1.35, 1.36, 1.78,
1.5, 1.5, 1.9, 0.95, 0.4, 0.75, 0.03, 1.83, 2.2, 2, 3.33, 1.9,
1.9, 1.625), resp = c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
1, 1, 0, 0, 1)), .Names = c("vol", "razao", "resp"), row.names = c(NA,
39L), class = "data.frame")

layout(1)
plot(resp ~ vol, data = PaulaTb3.12)

m0 <- glm(resp ~ vol,
          data = PaulaTb3.12,
          family = binomial)
summary(m0)

## 
## Call:
## glm(formula = resp ~ vol, family = binomial, data = PaulaTb3.12)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8341  -1.0023   0.2719   1.1185   1.4978  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.6627     0.8123  -2.047   0.0407 *
## vol           1.3357     0.6162   2.168   0.0302 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.040  on 38  degrees of freedom
## Residual deviance: 46.989  on 37  degrees of freedom
## AIC: 50.989
## 
## Number of Fisher Scoring iterations: 4

# DL_50.
dl <- -coef(m0)[1]/coef(m0)[2]

# str(m0$family)$link

plot(resp ~ vol, data = PaulaTb3.12)
curve(m0$family$linkinv(coef(m0)[1] + coef(m0)[2] * x),
      add = TRUE)
abline(v = dl, h = 0.5, lty = 2)

n <- nrow(PaulaTb3.12)
i <- 1:n

# Estimativas por Jackknife.
DL <- sapply(i,
             FUN = function(i) {
                 m0 <- glm(resp ~ vol,
                           data = PaulaTb3.12[-i, ],
                           family = binomial)
                 # Estimativa Parcial.
                 DL <- -coef(m0)[1]/coef(m0)[2]
                 # Pseudo-valor.
                 n * dl - (n - 1) * DL
             })

m <- mean(DL)
e <- sd(DL)/sqrt(n)

cbind(Est = rbind(MLE = dl,
                  Jack = m),
      StdErr = rbind(MLE = car::deltaMethod(m0, g = "-(Intercept)/vol")["SE"],
                     Jack = e))

##      (Intercept)        SE
## MLE     1.244812 0.2632587
## Jack    1.254201 0.2657998

plot(ecdf(DL))
rug(DL)

plot(density(DL))
rug(DL)

# NOTE: alguma pista sobre porque a distribuição fica bimodal?

Uma situação problemática

#-----------------------------------------------------------------------
# Aplicação na Mediana (derivada não suave) -> problema.

# Ordena o vetor de valores.
x <- sort(precip)

# Calcula a mediana com os dados originais.
M <- median(x)
M

## [1] 36.6

n <- length(x)
i <- 1:length(x)

# Estimativas da mediana por Jackknife.
y <- sapply(i,
            FUN = function(i) {
                ep <- median(x[-i])
                pv <- n * M - (n - 1) * ep
                return(pv)
            })

stem(y)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   0 | 99999999999999999999999999999999999
##   1 | 
##   2 | 
##   3 | 
##   4 | 
##   5 | 
##   6 | 44444444444444444444444444444444444

# NOTE: Explique porque o Jackknife para a mediana só retornou 2
# valores? Considere que o tamanho da amostra é dado abaixo.
length(x)

## [1] 70

Bootstrap

Regressão não linear

#-----------------------------------------------------------------------
# Aplicação de bootstrap. Estudo de caso com modelos não lineares.

library(alr3)
str(turk0)

## 'data.frame':    35 obs. of  2 variables:
##  $ A   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gain: int  644 631 661 624 633 610 615 605 608 599 ...

# Gráfico dos valores observados.
plot(Gain ~ A, data = turk0)

# Lista com os valores iniciais.
start <- list(Int = 625, Ass = 180, Mei = 0.1)

# Adiciona as curvas.
with(start, {
    curve(Int + Ass * A/(Mei + A),
          xname = "A", add = TRUE, col = 2)
    curve(Int + Ass * (1 - 2^(-A/Mei)),
          xname = "A", add = TRUE, col = 4)})

# Ajuste do primeiro modelo.
n0 <- nls(Gain ~ Int + Ass * A/(Mei + A),
          data = turk0,
          start = start)

# Ajuste do segundo modelo.
n1 <- nls(Gain ~ Int + Ass * (1 - 2^(-A/Mei)),
          data = turk0,
          start = start)

cbind(coef(n0), coef(n1))

##            [,1]         [,2]
## Int 622.2417493 622.95805415
## Ass 235.2174693 178.25190835
## Mei   0.1547585   0.09732193

#-----------------------------------------------------------------------
# Criando funções que fazem reamostragem dos dados e ajustam o modelo.

n <- nrow(turk0)
i <- 1:n

myboot0 <- function(formula, start) {
    n0 <- nls(formula = formula,
              data = turk0[sample(i, n, replace = TRUE), ],
              start = start)
    coef(n0)
}

#-----------------------------------------------------------------------
# Replicando.

# B = 999 reamostragens.
b0 <- replicate(999,
                myboot0(Gain ~ Int + Ass * A/(Mei + A), start))
b1 <- replicate(999,
                myboot0(Gain ~ Int + Ass * (1 - 2^(-A/Mei)), start))

# Conjunto de 1000 estimativas bootstrap.
b0 <- cbind(b0, coef(n0))
b1 <- cbind(b1, coef(n1))
str(b0)

##  num [1:3, 1:1000] 624.917 227.566 0.153 624.617 239.092 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "Int" "Ass" "Mei"
##   ..$ : NULL

# Calcula o vício absoluto (bootstrap - original).
rbind(n0 = rowMeans(b0) - coef(n0),
      n1 = rowMeans(b1) - coef(n1))

##           Int      Ass         Mei
## n0 -0.1248847 4.643737 0.007294549
## n1 -0.2939218 2.378039 0.002272771

# Variância do estimador.
rbind(n0 = apply(b0, MARGIN = 1, var),
      n1 = apply(b1, MARGIN = 1, var))

##         Int      Ass          Mei
## n0 35.84423 943.7282 0.0023195062
## n1 31.90712 158.5418 0.0002190582

# Função para calcular o erro quadrático médio.
eqm <- function(x) {
    sum((x - tail(x, 1))^2)/length(x)
}

# Erro quadrático médio.
rbind(n0 = apply(b0, 1, FUN = eqm),
      n1 = apply(b1, 1, FUN = eqm))

##         Int      Ass          Mei
## n0 35.82398 964.3487 0.0023703971
## n1 31.96160 164.0384 0.0002240046

Inferência para valores preditos

#-----------------------------------------------------------------------
# Curvas ajustadas de onde é possível determinar uma banda de confiança.

bts <- cbind(rep(1:2, each = ncol(b0)),
             as.data.frame(rbind(t(b0), t(b1))))
splom(bts[, -1], groups = bts[, 1])

par(mfrow = c(1, 2))
plot(Gain ~ A, data = turk0, type = "n")
apply(b0,
      MARGIN = 2,
      FUN = function(b) {
          curve(b[1] + b[2] * x/(b[3] + x),
                add = TRUE, col = rgb(0, 1, 1, 0.25), n = 51)
          invisible()
      })

## NULL

points(Gain ~ A, data = turk0)
abline(v = 0.2, lty = 2)
plot(Gain ~ A, data = turk0, type = "n")
apply(b1,
      MARGIN = 2,
      FUN = function(b) {
          curve(b[1] + b[2] * (1 - 2^(-x/b[3])),
                add = TRUE, col = rgb(1, 1, 0, 0.25), n = 51)
          invisible()
      })

## NULL

points(Gain ~ A, data = turk0)
abline(v = 0.2, lty = 2)

#-----------------------------------------------------------------------
# Estimativa do valor da função em x = 0.2.

p0 <- apply(b0, MARGIN = 2,
            FUN = function(b){
                b[1] + b[2] * 0.2/(b[3] + 0.2)
            })
p1 <- apply(b1, MARGIN = 2,
            FUN = function(b){
                b[1] + b[2] * (1 - 2^(-0.2/b[3]))
            })

cbind(Est = rbind(mean(p0), mean(p1)),
      StdErr = rbind(sd(p0), sd(p1)))

##          [,1]     [,2]
## [1,] 754.9576 4.516552
## [2,] 758.4149 4.673427

Usando o pacote `boot`

#-----------------------------------------------------------------------
# Usando o pacote boot.

library(boot)
library(latticeExtra)

# dput(as.list(coef(n1)))
start <- list(Int = 622.958054146589,
              Ass = 178.251908347242,
              Mei = 0.097321927254495)

# Criar uma função com dois argumentos: o data.frame original e um vetor
# que vai representar o índice das linhas usado para reamostrar dos
# dados.
fitmodel <- function(dataset, index) {
    n0 <- nls(Gain ~ Int + Ass * (1 - 2^(-A/Mei)),
              data = dataset[index, ],
              start = start)
    c(coef(n0), s2 = deviance(n0)/df.residual(n0))
}

set.seed(321)
b0 <- boot(data = turk0,
           statistic = fitmodel,
           R = 1999)

class(b0)

## [1] "boot"

methods(class = class(b0))

## [1] as.data.frame c             confint       hist          plot         
## [6] print         summary       vcov         
## see '?methods' for accessing help and source code

summary(b0)

##      R   original    bootBias    bootSE    bootMed
## 1 1999 622.958054   0.0225743  5.726720 622.752517
## 2 1999 178.251908   1.9825733 15.015128 179.120315
## 3 1999   0.097322   0.0025534  0.018418   0.097733
## 4 1999 386.481790 -31.1163672 69.507327 353.375813

# Como são obtidos.
theta <- b0$t

# Estimativas como conjunto de dados original.
b0$t0

##          Int          Ass          Mei           s2 
## 622.95805415 178.25190835   0.09732193 386.48178994

# Vício.
apply(theta, MARGIN = 2, FUN = mean) - b0$t0

##           Int           Ass           Mei            s2 
##   0.022574309   1.982573260   0.002553388 -31.116367189

# Desvio-padrão boostrap.
apply(theta, MARGIN = 2, FUN = sd)

## [1]  5.72671967 15.01512825  0.01841813 69.50732700

# Mediana bootstrap.
apply(theta, MARGIN = 2, FUN = median)

## [1] 622.75251653 179.12031530   0.09773343 353.37581349

st <- summary(b0)
str(st)

## Classes 'summary.boot' and 'data.frame': 4 obs. of  5 variables:
##  $ R       : num  1999 1999 1999 1999
##  $ original: num  622.9581 178.2519 0.0973 386.4818
##  $ bootBias: num  0.02257 1.98257 0.00255 -31.11637
##  $ bootSE  : num  5.7267 15.0151 0.0184 69.5073
##  $ bootMed : num  622.7525 179.1203 0.0977 353.3758

# Gráfico de pares.
pairs(b0)

vcov(b0)

##              [,1]        [,2]         [,3]          [,4]
## [1,]  32.79531815 -18.9667661 0.0398778540   69.43297130
## [2,] -18.96676614 225.4540763 0.2040272736   -4.43769918
## [3,]   0.03987785   0.2040273 0.0003392275    0.07502662
## [4,]  69.43297130  -4.4376992 0.0750266164 4831.26850663

cov2cor(vcov(b0))

##            [,1]         [,2]       [,3]         [,4]
## [1,]  1.0000000 -0.220576032 0.37807704  0.174433237
## [2,] -0.2205760  1.000000000 0.73775749 -0.004252049
## [3,]  0.3780770  0.737757494 1.00000000  0.058605614
## [4,]  0.1744332 -0.004252049 0.05860561  1.000000000

# Extrair os vetores de estativativas bootstrap.
str(b0)

## List of 11
##  $ t0       : Named num [1:4] 622.9581 178.2519 0.0973 386.4818
##   ..- attr(*, "names")= chr [1:4] "Int" "Ass" "Mei" "s2"
##  $ t        : num [1:1999, 1:4] 626 622 622 622 619 ...
##  $ R        : num 1999
##  $ data     :'data.frame':   35 obs. of  2 variables:
##   ..$ A   : num [1:35] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ Gain: int [1:35] 644 631 661 624 633 610 615 605 608 599 ...
##  $ seed     : int [1:626] 403 624 -333952915 -1819506614 -759241405 1506082216 -515332215 -1086775882 620038335 308851700 ...
##  $ statistic:function (dataset, index)  
##   ..- attr(*, "srcref")=Class 'srcref'  atomic [1:8] 15 13 20 1 13 1 15 20
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x9508920> 
##  $ sim      : chr "ordinary"
##  $ call     : language boot(data = turk0, statistic = fitmodel, R = 1999)
##  $ stype    : chr "i"
##  $ strata   : num [1:35] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weights  : num [1:35] 0.0286 0.0286 0.0286 0.0286 0.0286 ...
##  - attr(*, "class")= chr "boot"
##  - attr(*, "boot_type")= chr "boot"

colnames(b0$t) <- names(b0$t0)
best <- stack(as.data.frame(b0$t))
names(best)

## [1] "values" "ind"

st

##      R   original    bootBias    bootSE    bootMed
## 1 1999 622.958054   0.0225743  5.726720 622.752517
## 2 1999 178.251908   1.9825733 15.015128 179.120315
## 3 1999   0.097322   0.0025534  0.018418   0.097733
## 4 1999 386.481790 -31.1163672 69.507327 353.375813

densityplot(~values | ind,
            data = best,
            scales = "free",
            as.table = TRUE) +
    layer(panel.abline(v = st$original[which.packet()] +
                           c(0, st$bootBias[which.packet()]),
                       col = c(1, 2),
                       lty = 2))

# Intervalos de confiança.
confint(b0, type = "norm")

## Bootstrap quantiles, type =  normal 
## 
##            2.5 %      97.5 %
## Int 611.71131554 634.1596441
## Ass 146.84022450 205.6984457
## Mei   0.05866967   0.1308674
## s2  281.36629955 553.8300147

confint(b0, type = "basic")

## Bootstrap quantiles, type =  basic 
## 
##            2.5 %      97.5 %
## Int 610.46165695 633.4336766
## Ass 150.61507624 197.9774289
## Mei   0.06082153   0.1180598
## s2  271.84196772 546.0135969

confint(b0, type = "perc")

## Bootstrap quantiles, type =  percent 
## 
##            2.5 %      97.5 %
## Int 612.48243168 635.4544513
## Ass 158.52638777 205.8887405
## Mei   0.07658404   0.1338223
## s2  226.94998302 501.1216122

confint(b0, type = "bca")

## Bootstrap quantiles, type =  bca 
## 
##            2.5 %     97.5 %
## Int 613.44795481 636.850427
## Ass 155.64022730 201.246278
## Mei   0.07575608   0.131569
## s2  289.07574993 624.324303

Monte Carlo

Processo pontual

#-----------------------------------------------------------------------
# Teste para independência de processo pontual.

plot(NULL, NULL, xlim = c(0, 1), ylim = c(0, 1), asp = 1)
lines(x = c(0, 1, 1, 0, 0), y = c(0, 0, 1, 1, 0))

# xy <- locator(n = 20, type = "p", pch = 19)
# dput(lapply(xy, round, digits = 3))

xy <- structure(list(x = c(0.204, 0.186, 0.529, 0.529, 0.385, 0.579,
                           0.918, 0.798, 0.634, 0.761, 0.704, 0.485,
                           0.291, 0.341, 0.402, 0.833, 0.972, 0.625,
                           0.732, 0.315),
                     y = c(0.829, 0.545, 0.526, 0.752, 0.674, 0.648,
                           0.792, 0.33, 0.121, 0.127, 0.332, 0.352,
                           0.188, 0.452, 0.221, 0.524, 0.957, 0.964,
                           0.755, 0.332)),
                .Names = c("x", "y"))

#-----------------------------------------------------------------------

# Transforma a lista em matriz.
xy <- do.call(cbind, xy)

plot(xy,
     NULL,
     xlim = c(0, 1),
     ylim = c(0, 1),
     asp = 1,
     axes = FALSE,
     ann = FALSE)
lines(x = c(0, 1, 1, 0, 0),
      y = c(0, 0, 1, 1, 0),
      lty = 2)

# Calcula todas as distâncias (euclidianas).
d <- dist(xy)
d

##             1          2          3          4          5          6
## 2  0.28456985                                                       
## 3  0.44433546 0.34352584                                            
## 4  0.33399701 0.40062202 0.22600000                                 
## 5  0.23829813 0.23715396 0.20649455 0.16376813                      
## 6  0.41639645 0.40627331 0.13184840 0.11539497 0.19573451           
## 7  0.71495804 0.77254967 0.47125046 0.39105115 0.54590567 0.36831644
## 8  0.77578154 0.64866709 0.33283179 0.50044480 0.53749884 0.38611527
## 9  0.82835017 0.61683061 0.41838977 0.63967648 0.60647341 0.52986225
## 10 0.89613224 0.71087903 0.46154631 0.66667008 0.66376577 0.55187408
## 11 0.70498865 0.56008303 0.26126806 0.45500000 0.46768045 0.33982495
## 12 0.55361539 0.35587919 0.17947702 0.40241272 0.33717058 0.31056722
## 13 0.64687711 0.37212095 0.41338602 0.61216011 0.49500707 0.54271908
## 14 0.40112093 0.18075951 0.20203960 0.35403955 0.22631836 0.30831802
## 15 0.63942787 0.38939954 0.33038462 0.54597619 0.45331887 0.46223154
## 16 0.69904649 0.64734071 0.30400658 0.38000000 0.47244471 0.28265173
## 17 0.77859360 0.88743450 0.61806958 0.48813318 0.65165789 0.49993000
## 18 0.44211537 0.60686242 0.44839715 0.23272301 0.37643060 0.31933055
## 19 0.53316039 0.58499231 0.30602287 0.20302217 0.35632850 0.18670297
## 20 0.50924454 0.24901807 0.28884598 0.47137671 0.34909025 0.41176692
##             7          8          9         10         11         12
## 2                                                                   
## 3                                                                   
## 4                                                                   
## 5                                                                   
## 6                                                                   
## 7                                                                   
## 8  0.47733007                                                       
## 9  0.72862679 0.26566332                                            
## 10 0.68328179 0.20634437 0.12714165                                 
## 11 0.50734209 0.09402127 0.22230834 0.21277688                      
## 12 0.61732406 0.31377221 0.27488543 0.35609128 0.21991135           
## 13 0.87060037 0.52651021 0.34948247 0.47394198 0.43738427 0.25403149
## 14 0.66972308 0.47300423 0.44205203 0.53106026 0.38232055 0.17531686
## 15 0.76960834 0.41072740 0.25263412 0.37110241 0.32175301 0.15508062
## 16 0.28115654 0.19713194 0.44945523 0.40347615 0.23131148 0.38818552
## 17 0.17361164 0.65069578 0.90174276 0.85640002 0.68003603 0.77665565
## 18 0.33975432 0.65717958 0.84304804 0.84797700 0.63691836 0.62780889
## 19 0.18964440 0.43009418 0.64152942 0.62866923 0.42392570 0.47267113
## 20 0.75842534 0.48300414 0.38246830 0.49085741 0.38900000 0.17117243
##            13         14         15         16         17         18
## 2                                                                   
## 3                                                                   
## 4                                                                   
## 5                                                                   
## 6                                                                   
## 7                                                                   
## 8                                                                   
## 9                                                                   
## 10                                                                  
## 11                                                                  
## 12                                                                  
## 13                                                                  
## 14 0.26869313                                                       
## 15 0.11580155 0.23891840                                            
## 16 0.63769899 0.49724038 0.52684912                                 
## 17 1.02719132 0.80819923 0.93091138 0.45476367                      
## 18 0.84482661 0.58549125 0.77574351 0.48668676 0.34707060           
## 19 0.71831052 0.49466150 0.62773880 0.25211505 0.31369412 0.23479779
## 20 0.14598630 0.12278436 0.14103191 0.55243823 0.90679325 0.70393466
##            19
## 2            
## 3            
## 4            
## 5            
## 6            
## 7            
## 8            
## 9            
## 10           
## 11           
## 12           
## 13           
## 14           
## 15           
## 16           
## 17           
## 18           
## 19           
## 20 0.59398485

# Obtém a menor distância.
m <- min(d)

# Número de pontos.
n <- nrow(xy)

# Geração de estatísticas por simulação do modelo assumido sob
# hipótese nula: localização uniforme no quadrado.
M <- replicate(9999, {
    # As localizações de n pontos.
    loc <- cbind(x = runif(n),
                 y = runif(n))
    # A menor distância entre eles sob H_0.
    min(dist(loc))
})

# Concatena as estatísticas simuladas com a observada.
M <- c(m, M)

# Gráfico da distribuição acumulada empírica sob H_0.
plot(ecdf(M))
abline(h = c(0.025, 0.975), lty = 2, col = 2)
abline(v = m, col = 2)

# Gráfico da distribuição acumulada empírica sob H_0.
plot(density(M))
abline(v = m, col = 2)
abline(v = quantile(M, c(0.025, 0.975)), lty = 2, col = 2)
rug(M)

# P-valor.
2 * sum(M > m)/length(M)

## [1] 0.0098

#-----------------------------------------------------------------------
# Moficando a estatística de teste.

xy <- structure(list(x = c(0.088, 0.326, 0.577, 0.846, 0.857, 0.568,
                           0.306, 0.077, 0.077, 0.328, 0.597, 0.883,
                           0.863, 0.64, 0.337, 0.088, 0.077, 0.346,
                           0.654, 0.619),
                     y = c(0.92, 0.922, 0.916, 0.935, 0.737, 0.674,
                           0.67, 0.665, 0.452, 0.502, 0.461, 0.454,
                           0.256, 0.26, 0.26, 0.219, 0.045, 0.06, 0.058,
                           0.439)),
                .Names = c("x", "y"))

# Transforma a lista em matriz.
xy <- do.call(cbind, xy)

plot(xy,
     NULL,
     xlim = c(0, 1),
     ylim = c(0, 1),
     asp = 1,
     axes = FALSE,
     ann = FALSE)
lines(x = c(0, 1, 1, 0, 0),
      y = c(0, 0, 1, 1, 0),
      lty = 2)

# Todas as distância entre os pontos (a estatística de teste é um vetor
# e não um escalar).
d <- c(dist(xy))
d

##   [1] 0.2380084 0.4890164 0.7581484 0.7904745 0.5393663 0.3316987 0.2552371
##   [8] 0.4681293 0.4820000 0.6853919 0.9215102 1.0205494 0.8604092 0.7054084
##  [15] 0.7010000 0.8750691 0.8978664 1.0312129 0.7164649 0.2510717 0.5201625
##  [22] 0.5623042 0.3465083 0.2527924 0.3578407 0.5318844 0.4200048 0.5347541
##  [29] 0.7275115 0.8555262 0.7326937 0.6620914 0.7421947 0.9116633 0.8622320
##  [36] 0.9241645 0.5649230 0.2696702 0.3323266 0.2421673 0.3660014 0.5594649
##  [43] 0.6821261 0.4831118 0.4554393 0.5541480 0.7193024 0.6590182 0.6985242
##  [50] 0.8514282 1.0043112 0.8866211 0.8614482 0.4788455 0.1983053 0.3813201
##  [57] 0.6015189 0.8150221 0.9081024 0.6751392 0.5354223 0.4824210 0.6792128
##  [64] 0.7057344 0.8454029 1.0426984 1.1762062 1.0077822 0.8977711 0.5454769
##  [71] 0.2957871 0.5550586 0.7833160 0.8304366 0.5788489 0.3791781 0.2841918
##  [78] 0.4810374 0.5240401 0.7056408 0.9271920 1.0427195 0.8482040 0.7086960
##  [85] 0.3813765 0.2620305 0.4910825 0.5388553 0.2952694 0.2149651 0.3842200
##  [92] 0.5116141 0.4202142 0.4740854 0.6613811 0.7979486 0.6529012 0.6219743
##  [99] 0.2404704 0.2290546 0.3161724 0.1694344 0.3582764 0.6161047 0.6940065
## [106] 0.5288251 0.4111703 0.5009241 0.6656320 0.6113101 0.7040227 0.3890116
## [113] 0.2130000 0.2992825 0.5585839 0.8331608 0.8860457 0.6935373 0.4812744
## [120] 0.4461356 0.6200000 0.6621072 0.8374831 0.5872308 0.2559316 0.5200779
## [127] 0.8060025 0.8100691 0.5948386 0.3232089 0.2332595 0.4070000 0.4754209
## [134] 0.6986881 0.5421559 0.2721066 0.5570718 0.5888472 0.3948519 0.2421673
## [141] 0.3710647 0.5213924 0.4423664 0.5508285 0.2977415 0.2860857 0.3358288
## [148] 0.2055480 0.3286351 0.5636000 0.6659249 0.4730772 0.4070111 0.0311127
## [155] 0.1990075 0.3109421 0.5794411 0.8290054 0.9038346 0.6660368 0.4574462
## [162] 0.2644258 0.2230359 0.5260152 0.7758827 0.8138286 0.5529060 0.2878976
## [169] 0.3050000 0.3030000 0.5535206 0.6026558 0.3555784 0.2024846 0.1802276
## [176] 0.2523529 0.3373796 0.2002024 0.3758896 0.3340135 0.1743474 0.3030594
## [183] 0.5884531 0.5747704 0.2694179 0.5771464 0.6700746 0.3080065 0.4670867
## [190] 0.3826042

# Gerando estatísticas de teste por simulação.
D <- replicate(499, {
    dist(cbind(x = runif(n), y = runif(n)))
})

# Juntanto observado com simulado.
D <- cbind(d, D)
str(D)

##  num [1:190, 1:500] 0.238 0.489 0.758 0.79 0.539 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:500] "d" "" "" "" ...

plot(ecdf(D[, 1]), cex = 0)
for (i in 2:ncol(D)) {
    lines(ecdf(D[, i]), cex = 0, col = "gray50")
}
lines(ecdf(D[, 1]), cex = 0, col = 2, lwd = 2)

Amostragem por conjuntos ordenados

#-----------------------------------------------------------------------
# Amostragem por Conjuntos Ordenados.

# Extraí valores de uma população normal padrão.
rand <- function(n) {
    rnorm(n, 0, 1)
}
rand(10)

##  [1] -0.2458259  0.6357846 -0.4676933 -0.1834748  0.0672818  0.5451266
##  [7] -0.3683698 -0.8494036  0.7576407 -0.9766793

# Ranked Set Sampling (Perfect Ranking). Calcula a média.
rss <- function(m = 3) {
    x <- matrix(rand(m * m), nrow = m)
    x <- apply(x, MARGIN = 1, sort)
    mean(diag(x))
}
rss(m = 5)

## [1] -0.2394357

# Simple Random Sampling. Calcula a média.
srs <- function(m = 3) {
    mean(rand(m))
}
rss(m = 5)

## [1] 0.09585034

# Ranked Set Sampling with Unperfect Ranking. Calcula a média.
rssur <- function(m = 3, rho) {
    x <- MASS::mvrnorm(m * m,
                       mu = c(0, 0),
                       Sigma = matrix(c(1, rho, rho, 1), nrow = 2))
    g <- gl(m, m)
    b <- by(data = x,
            INDICES = g,
            FUN = function(y) {
                o <- order(y[, 2])
                # cbind(y[o, 1], y[o, 2])
                y[o, 1]
            }, simplify = TRUE)
    mean(diag(do.call(rbind, b)))
}
rssur(m = 5, rho = 0.95)

## [1] 0.006458089

#-----------------------------------------------------------------------
# Simulação.

# Tamanho da amostra final.
m <- 10

# Distribuição da média na amostra aleatória simples.
system.time(m1 <- replicate(1000, srs(m = m)))

##    user  system elapsed 
##   0.008   0.000   0.008

# Distribuição da média na amostra por conjuntos ordenados perfeito.
system.time(m2 <- replicate(1000, rss(m = m)))

##    user  system elapsed 
##   0.332   0.000   0.332

# Distribuição da média na amostra por conjuntos ordenados imperfeito.
system.time(m3 <- replicate(1000, rssur(m = m, rho = 0.75)))

##    user  system elapsed 
##   1.024   0.000   1.023

library(lattice)
library(latticeExtra)

densityplot(~m1 + m2 + m3,
            auto.key = TRUE,
            layout = c(NA, 1)) +
    layer(panel.abline(v = 0, lty = 2))

ecdfplot(~m1 + m2 + m3,
         auto.key = TRUE,
         layout = c(NA, 1))

qqmath(~m1 + m2 + m3,
       auto.key = TRUE,
       layout = c(NA, 1))

Métodos computacionalmente intensivos para inferência

Walmes Marques Zeviani

2018-06-30

Testes de aleatorização

Uma senhora toma chá

Teste para a diferença de médias

Teste para a correlação

Índice de Moran

Jackknife

Regressão não linear

Dose letal para 50% da população

Uma situação problemática

Bootstrap

Regressão não linear

Inferência para valores preditos

Usando o pacote `boot`

Monte Carlo

Processo pontual

Amostragem por conjuntos ordenados