Universidade Federal do Paraná
Curso de Estatística
CE 083 - Estatística Computacional I - 2014/2
Prof. Dr. Walmes Marques Zeviani

Aula 06

Tabela de conteúdo

Conversão entre formatos de objetos
- Vetor
- Arranjo e matriz
- Tabela
- Lista
Família de funções *apply

Conversão entre formatos de objetos

##-----------------------------------------------------------------------------
## Funções que fazem conversão. Conversões podem ser de conteúdo (ex:
## numeric -> character, factor -> integer) e podem ser formatos (ex:
## matrix -> vector, vector -> list). Outros tipos podem existir, como
## conversões de classe.

## Funções de conversão. Começam com `as.`.
a <- apropos("^as\\."); a

##   [1] "as.array"                      "as.array.default"             
##   [3] "as.call"                       "as.character"                 
##   [5] "as.character.condition"        "as.character.Date"            
##   [7] "as.character.default"          "as.character.error"           
##   [9] "as.character.factor"           "as.character.hexmode"         
##  [11] "as.character.numeric_version"  "as.character.octmode"         
##  [13] "as.character.POSIXt"           "as.character.srcref"          
##  [15] "as.complex"                    "as.data.frame"                
##  [17] "as.data.frame.array"           "as.data.frame.AsIs"           
##  [19] "as.data.frame.character"       "as.data.frame.complex"        
##  [21] "as.data.frame.data.frame"      "as.data.frame.Date"           
##  [23] "as.data.frame.default"         "as.data.frame.difftime"       
##  [25] "as.data.frame.factor"          "as.data.frame.integer"        
##  [27] "as.data.frame.list"            "as.data.frame.logical"        
##  [29] "as.data.frame.matrix"          "as.data.frame.model.matrix"   
##  [31] "as.data.frame.numeric"         "as.data.frame.numeric_version"
##  [33] "as.data.frame.ordered"         "as.data.frame.POSIXct"        
##  [35] "as.data.frame.POSIXlt"         "as.data.frame.raw"            
##  [37] "as.data.frame.table"           "as.data.frame.ts"             
##  [39] "as.data.frame.vector"          "as.Date"                      
##  [41] "as.Date.character"             "as.Date.date"                 
##  [43] "as.Date.dates"                 "as.Date.default"              
##  [45] "as.Date.factor"                "as.Date.numeric"              
##  [47] "as.Date.POSIXct"               "as.Date.POSIXlt"              
##  [49] "as.dendrogram"                 "as.difftime"                  
##  [51] "as.dist"                       "as.double"                    
##  [53] "as.double.difftime"            "as.double.POSIXlt"            
##  [55] "as.environment"                "as.expression"                
##  [57] "as.expression.default"         "as.factor"                    
##  [59] "as.formula"                    "as.function"                  
##  [61] "as.function.default"           "as.graphicsAnnot"             
##  [63] "as.hclust"                     "as.hexmode"                   
##  [65] "as.integer"                    "as.list"                      
##  [67] "as.list.data.frame"            "as.list.Date"                 
##  [69] "as.list.default"               "as.list.environment"          
##  [71] "as.list.factor"                "as.list.function"             
##  [73] "as.list.numeric_version"       "as.list.POSIXct"              
##  [75] "as.logical"                    "as.logical.factor"            
##  [77] "as.matrix"                     "as.matrix.data.frame"         
##  [79] "as.matrix.default"             "as.matrix.noquote"            
##  [81] "as.matrix.POSIXlt"             "as.name"                      
##  [83] "as.null"                       "as.null.default"              
##  [85] "as.numeric"                    "as.numeric_version"           
##  [87] "as.octmode"                    "as.ordered"                   
##  [89] "as.package_version"            "as.pairlist"                  
##  [91] "as.person"                     "as.personList"                
##  [93] "as.POSIXct"                    "as.POSIXct.date"              
##  [95] "as.POSIXct.Date"               "as.POSIXct.dates"             
##  [97] "as.POSIXct.default"            "as.POSIXct.numeric"           
##  [99] "as.POSIXct.POSIXlt"            "as.POSIXlt"                   
## [101] "as.POSIXlt.character"          "as.POSIXlt.date"              
## [103] "as.POSIXlt.Date"               "as.POSIXlt.dates"             
## [105] "as.POSIXlt.default"            "as.POSIXlt.factor"            
## [107] "as.POSIXlt.numeric"            "as.POSIXlt.POSIXct"           
## [109] "as.qr"                         "as.raster"                    
## [111] "as.raw"                        "as.relistable"                
## [113] "as.roman"                      "as.single"                    
## [115] "as.single.default"             "as.stepfun"                   
## [117] "as.symbol"                     "as.table"                     
## [119] "as.table.default"              "as.ts"                        
## [121] "as.vector"                     "as.vector.factor"

## Aquelas que contêm no nome os tipos de objetos já vistos.
grep("(list|vector|matrix|array|frame)", a, value=TRUE)

##  [1] "as.array"                      "as.array.default"             
##  [3] "as.data.frame"                 "as.data.frame.array"          
##  [5] "as.data.frame.AsIs"            "as.data.frame.character"      
##  [7] "as.data.frame.complex"         "as.data.frame.data.frame"     
##  [9] "as.data.frame.Date"            "as.data.frame.default"        
## [11] "as.data.frame.difftime"        "as.data.frame.factor"         
## [13] "as.data.frame.integer"         "as.data.frame.list"           
## [15] "as.data.frame.logical"         "as.data.frame.matrix"         
## [17] "as.data.frame.model.matrix"    "as.data.frame.numeric"        
## [19] "as.data.frame.numeric_version" "as.data.frame.ordered"        
## [21] "as.data.frame.POSIXct"         "as.data.frame.POSIXlt"        
## [23] "as.data.frame.raw"             "as.data.frame.table"          
## [25] "as.data.frame.ts"              "as.data.frame.vector"         
## [27] "as.list"                       "as.list.data.frame"           
## [29] "as.list.Date"                  "as.list.default"              
## [31] "as.list.environment"           "as.list.factor"               
## [33] "as.list.function"              "as.list.numeric_version"      
## [35] "as.list.POSIXct"               "as.matrix"                    
## [37] "as.matrix.data.frame"          "as.matrix.default"            
## [39] "as.matrix.noquote"             "as.matrix.POSIXlt"            
## [41] "as.pairlist"                   "as.relistable"                
## [43] "as.vector"                     "as.vector.factor"

Vetor

##-----------------------------------------------------------------------------
## Conversões de forma sobre um vetor.

x <- runif(12)

## Veja que é um vetor.
c(is.vector(x), is.matrix(x), is.array(x), is.list(x), is.data.frame(x))

## [1]  TRUE FALSE FALSE FALSE FALSE

## Vetor é a forma mais básica de organizar dados. Tem uma só dimensão e
## todos os elementos são do mesmo tipo de conteúdo. Assim, pode-se sem
## restrições convertê-lo para objetos de estrutura mais complexa.

##-----------------------------------------------------------------------------
## Para matriz.

y <- as.matrix(x)
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1] FALSE  TRUE  TRUE FALSE FALSE

class(y)

## [1] "matrix"

str(y)

##  num [1:12, 1] 0.739 0.838 0.626 0.714 0.189 ...

## Pode transformar em matriz forçando a existência do atributo dim,
## típico de matrizes e arranjos.

y <- x
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1]  TRUE FALSE FALSE FALSE FALSE

dim(y)

## NULL

dim(y) <- c(3,4)
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1] FALSE  TRUE  TRUE FALSE FALSE

##        [,1]   [,2]   [,3]   [,4]
## [1,] 0.7391 0.7137 0.9478 0.1132
## [2,] 0.8381 0.1886 0.3553 0.3114
## [3,] 0.6262 0.2546 0.2190 0.7837

##-----------------------------------------------------------------------------
## Para arranjo (matriz é arranjo de duas dimensões).

y <- as.array(x)
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1] FALSE FALSE  TRUE FALSE FALSE

class(y)

## [1] "array"

str(y)

##  num [1:12(1d)] 0.739 0.838 0.626 0.714 0.189 ...

## Pode-se convertê-lo para array atribuindo dimensões.

y <- x
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1]  TRUE FALSE FALSE FALSE FALSE

dim(y)

## NULL

dim(y) <- c(3,2,2)
c(is.vector(y), is.matrix(y), is.array(y), is.list(y), is.data.frame(y))

## [1] FALSE FALSE  TRUE FALSE FALSE

## , , 1
## 
##        [,1]   [,2]
## [1,] 0.7391 0.7137
## [2,] 0.8381 0.1886
## [3,] 0.6262 0.2546
## 
## , , 2
## 
##        [,1]   [,2]
## [1,] 0.9478 0.1132
## [2,] 0.3553 0.3114
## [3,] 0.2190 0.7837

##-----------------------------------------------------------------------------
## Para data.frame (tabela).

y <- as.data.frame(x)
str(y)

## 'data.frame':    12 obs. of  1 variable:
##  $ x: num  0.739 0.838 0.626 0.714 0.189 ...

##         x
## 1  0.7391
## 2  0.8381
## 3  0.6262
## 4  0.7137
## 5  0.1886
## 6  0.2546
## 7  0.9478
## 8  0.3553
## 9  0.2190
## 10 0.1132
## 11 0.3114
## 12 0.7837

##-----------------------------------------------------------------------------
## Para lista.

y <- as.list(x)
str(y)

## List of 12
##  $ : num 0.739
##  $ : num 0.838
##  $ : num 0.626
##  $ : num 0.714
##  $ : num 0.189
##  $ : num 0.255
##  $ : num 0.948
##  $ : num 0.355
##  $ : num 0.219
##  $ : num 0.113
##  $ : num 0.311
##  $ : num 0.784

Arranjo e matriz

##-----------------------------------------------------------------------------
## Conversões a partir de matrizes e arranjos.

m <- matrix(runif(12), 3, 4)
m

##         [,1]   [,2]   [,3]   [,4]
## [1,] 0.02899 0.4460 0.6971 0.5420
## [2,] 0.73112 0.4200 0.1558 0.5841
## [3,] 0.23062 0.4027 0.6761 0.5728

c(is.vector(m), is.matrix(m), is.array(m), is.list(m), is.data.frame(m))

## [1] FALSE  TRUE  TRUE FALSE FALSE

##-----------------------------------------------------------------------------
## Para vetor.

n <- as.vector(m)
str(n)

##  num [1:12] 0.029 0.731 0.231 0.446 0.42 ...

## A matriz tem as colunas empilhadas para gerar o vetor. Sempre será
## assim, as dimensões de maior ordem são desfeitas depois das de menor
## ordem. Para ficar claro, veja como fica com um arranjo de 4
## dimensões.

## Random digits (0-9) arranged in a 4-dimensional array. The `replace`
## argument is written in full instead of relying on partial argument
## matching (`repl`).
a <- array(sample(0:9, 3*3*2*2, replace=TRUE), dim=c(3,3,2,2))
str(a)

##  int [1:3, 1:3, 1:2, 1:2] 4 2 3 9 9 6 6 8 6 2 ...

## , , 1, 1
## 
##      [,1] [,2] [,3]
## [1,]    4    9    6
## [2,]    2    9    8
## [3,]    3    6    6
## 
## , , 2, 1
## 
##      [,1] [,2] [,3]
## [1,]    2    4    7
## [2,]    8    8    9
## [3,]    5    4    5
## 
## , , 1, 2
## 
##      [,1] [,2] [,3]
## [1,]    3    2    6
## [2,]    2    2    6
## [3,]    8    5    6
## 
## , , 2, 2
## 
##      [,1] [,2] [,3]
## [1,]    7    1    4
## [2,]    5    3    0
## [3,]    9    4    1

as.vector(a)

##  [1] 4 2 3 9 9 6 6 8 6 2 8 5 4 8 4 7 9 5 3 2 8 2 2 5 6 6 6 7 5 9 1 3 4 4 0 1

##-----------------------------------------------------------------------------
## Para data.frame.

n <- as.data.frame(m)
str(n)

## 'data.frame':    3 obs. of  4 variables:
##  $ V1: num  0.029 0.731 0.231
##  $ V2: num  0.446 0.42 0.403
##  $ V3: num  0.697 0.156 0.676
##  $ V4: num  0.542 0.584 0.573

n <- as.data.frame(t(m))
str(n)

## 'data.frame':    4 obs. of  3 variables:
##  $ V1: num  0.029 0.446 0.697 0.542
##  $ V2: num  0.731 0.42 0.156 0.584
##  $ V3: num  0.231 0.403 0.676 0.573

##-----------------------------------------------------------------------------
## Para lista.

n <- as.list(m)
str(n)

## List of 12
##  $ : num 0.029
##  $ : num 0.731
##  $ : num 0.231
##  $ : num 0.446
##  $ : num 0.42
##  $ : num 0.403
##  $ : num 0.697
##  $ : num 0.156
##  $ : num 0.676
##  $ : num 0.542
##  $ : num 0.584
##  $ : num 0.573

## Aqui vê-se que a matriz passou para vetor e depois para lista. Caso
## se quisesse que cada coluna (ou linha) fosse um item da lista
## ter-se-ia que proceder de outra forma. Uma forma é passando antes para
## data.frame a outra é usando uma função da família *apply.

Tabela

##-----------------------------------------------------------------------------
## Conversões a partir de data.frames.

## d <- mtcars
d <- trees
c(is.vector(d), is.matrix(d), is.array(d), is.list(d), is.data.frame(d))

## [1] FALSE FALSE FALSE  TRUE  TRUE

str(d)

## 'data.frame':    31 obs. of  3 variables:
##  $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
##  $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
##  $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...

e <- as.vector(d)
str(e)

## 'data.frame':    31 obs. of  3 variables:
##  $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
##  $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
##  $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...

## Não deu efeito. Caso realmente queira que um data.frame seja
## decomposto e vire vetor tem que antes passar por matriz. Mas deve ser
## lembrado que um data.frame permite que cada coluna tenha conteúdo de
## tipo próprio e numa matriz o conteúdo deve ser do mesmo tipo.

is.list(d)

## [1] TRUE

## Note que data.frame é um tipo especial de lista no qual as colunas
## são os itens e esses itens são todos vetores de mesmo comprimento, o
## que permite que eles sejam acomodados lado a lado (como coluna) em um
## data.frame.

as.list(d)

## $Girth
##  [1]  8.3  8.6  8.8 10.5 10.7 10.8 11.0 11.0 11.1 11.2 11.3 11.4 11.4 11.7 12.0 12.9 12.9
## [18] 13.3 13.7 13.8 14.0 14.2 14.5 16.0 16.3 17.3 17.5 17.9 18.0 18.0 20.6
## 
## $Height
##  [1] 70 65 63 72 81 83 66 75 80 75 79 76 76 69 75 74 85 86 71 64 78 80 74 72 77 81 82 80
## [29] 80 80 87
## 
## $Volume
##  [1] 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 24.2 21.0 21.4 21.3 19.1 22.2 33.8
## [18] 27.4 25.7 24.9 34.5 31.7 36.3 38.3 42.6 55.4 55.7 58.3 51.5 51.0 77.0

Lista

##-----------------------------------------------------------------------------
## Conversões a partir de listas.

## A list with regular content: every item is a numeric vector of the
## same length. `replace` is spelled out in full rather than abbreviated
## (`repl`), so the call does not depend on partial argument matching.
l <- list(a=1:5, b=runif(5), c=sample(0:1, 5, replace=TRUE))
l

## $a
## [1] 1 2 3 4 5
## 
## $b
## [1] 0.71683 0.57932 0.12778 0.99206 0.06632
## 
## $c
## [1] 0 0 1 1 1

## as.vector(l)     ## Não.
## as.array(l)      ## Não.
as.data.frame(l) ## Sim.

##   a       b c
## 1 1 0.71683 0
## 2 2 0.57932 0
## 3 3 0.12778 1
## 4 4 0.99206 1
## 5 5 0.06632 1

## Irregular content: items with different lengths or different
## structures (vectors, a matrix and a data.frame). `replace` is spelled
## out in full rather than abbreviated (`repl`).
l <- list(a=1:5, b=runif(5), c=sample(0:1, 5, replace=TRUE),
          d=matrix(1:9, 3, 3), e=head(iris))
l

## $a
## [1] 1 2 3 4 5
## 
## $b
## [1] 0.02573 0.03268 0.02440 0.79516 0.56491
## 
## $c
## [1] 0 1 1 1 1
## 
## $d
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## $e
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

as.data.frame(l)

## Error: arguments imply differing number of rows: 5, 3, 6

## Passar de um objeto mais simples para um mais complexo é possível,
## pois é *simples complicar* mas de um complexo para um simples requer
## que muitos requisitos sejam satisfeitos e por isso é *complicado
## simplificar*.

##-----------------------------------------------------------------------------
## Os objetos retornados por funções do R são em geral em forma de lista
## porque facilmente eles podem diferir em dimensão, tipo de conteúdo e
## forma, o que impede uso de estruturas simples. Abaixo, a partir da
## lista obtida ao fazer um histograma, será gerada uma tabela com as
## frequências, os pontos centrais e os limites de cada classe.

## Um vetor com os valores de precipitação pluviométrica (chuva
## acumulada) para cidades dos USA.
str(precip)

##  Named num [1:70] 67 54.7 7 48.5 14 17.2 20.7 13 43.4 40.2 ...
##  - attr(*, "names")= chr [1:70] "Mobile" "Juneau" "Phoenix" "Little Rock" ...

h <- hist(precip, col="seagreen")

plot of chunk unnamed-chunk-6

str(h)

## List of 6
##  $ breaks  : num [1:8] 0 10 20 30 40 50 60 70
##  $ counts  : int [1:7] 4 9 5 25 21 5 1
##  $ density : num [1:7] 0.00571 0.01286 0.00714 0.03571 0.03 ...
##  $ mids    : num [1:7] 5 15 25 35 45 55 65
##  $ xname   : chr "precip"
##  $ equidist: logi TRUE
##  - attr(*, "class")= chr "histogram"

## Quer-se uma tabela com os extremos e meio das classes além das
## frequências absolutas e relativas, simples e acumuladas.

## Take the counts and mids components of the histogram object. They are
## selected by name so the code does not depend on the position of each
## component inside the list.
l <- h[c("counts", "mids")]
str(l)

## List of 2
##  $ counts: int [1:7] 4 9 5 25 21 5 1
##  $ mids  : num [1:7] 5 15 25 35 45 55 65

n <- length(h$mids); n ## Number of classes.

## [1] 7

## Add the lower and upper limits of the classes. breaks holds n + 1
## values: dropping the last one leaves the lower limits and dropping the
## first one leaves the upper limits (avoids the easily misread `1:n+1`).
l$lwr <- h$breaks[-(n + 1)]
l$upr <- h$breaks[-1]

## Cumulative absolute frequency.
l$fac <- cumsum(l$counts)

## Relative frequencies (simple and cumulative).
l$freq <- l$counts/sum(l$counts)
l$facr <- l$fac/sum(l$counts)

## The desired table.
as.data.frame(l)

##   counts mids lwr upr fac    freq    facr
## 1      4    5   0  10   4 0.05714 0.05714
## 2      9   15  10  20  13 0.12857 0.18571
## 3      5   25  20  30  18 0.07143 0.25714
## 4     25   35  30  40  43 0.35714 0.61429
## 5     21   45  40  50  64 0.30000 0.91429
## 6      5   55  50  60  69 0.07143 0.98571
## 7      1   65  60  70  70 0.01429 1.00000

Família de funções `*apply`

##-----------------------------------------------------------------------------
## Os membros da família *apply.

## Print every object whose name ends in "apply", one per line.
cat(apropos("apply$"), sep="\n")

## apply
## dendrapply
## eapply
## kernapply
## lapply
## mapply
## .mapply
## rapply
## sapply
## tapply
## vapply

## De uso simples e frequente.
## apply  : on Arrays margins (não é aapply para evitar cacofonia).
## lapply : on List items or vector elements or data.frame columns.
## sapply : lapply that Simplifies when possible.
## tapply : on a ragged array, return as Tabular format.

## De uso menos frequente.
## mapply : lapply over Multiple list or vector arguments.
## eapply : on Environments.
## rapply : Recursive.
## vapply : sapply with a pre-specified return Value (safer).
## dendrapply : related to dendrograms.
## kernapply : related to kernels.

## Além destas, as funções by() e aggregate() são úteis e são agregados
## da notável família.

##-----------------------------------------------------------------------------
## tapply.

str(npk)

## 'data.frame':    24 obs. of  5 variables:
##  $ block: Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ N    : Factor w/ 2 levels "0","1": 1 2 1 2 2 2 1 1 1 2 ...
##  $ P    : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 2 2 ...
##  $ K    : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 2 1 1 2 ...
##  $ yield: num  49.5 62.8 46.8 57 59.8 58.5 55.5 56 62.8 55.8 ...

## Média de yield para cada nível de N.
r <- tapply(npk$yield, npk$N, mean)
r

##     0     1 
## 52.07 57.68

class(r)

## [1] "array"

## Média de yield combinando os níveis de N e P.
r <- tapply(npk$yield, list(npk$N, npk$P), mean)
r

##       0     1
## 0 51.72 52.42
## 1 59.22 56.15

## Melhor usar com with() para simplificar a declaração.
r <- with(npk, tapply(yield, list(N, P), mean))
r

##       0     1
## 0 51.72 52.42
## 1 59.22 56.15

## O mesmo considerando N, P e K.
r <- with(npk, tapply(yield, list(N, P, K), mean))
r

## , , 0
## 
##       0     1
## 0 51.43 54.33
## 1 63.77 57.93
## 
## , , 1
## 
##       0     1
## 0 52.00 50.50
## 1 54.67 54.37

## Nomes na lista geram nomes para as dimensões do array.
r <- with(npk, tapply(yield, list(Nitro=N, Phos=P, Pot=K), mean))
r

## , , Pot = 0
## 
##      Phos
## Nitro     0     1
##     0 51.43 54.33
##     1 63.77 57.93
## 
## , , Pot = 1
## 
##      Phos
## Nitro     0     1
##     0 52.00 50.50
##     1 54.67 54.37

##-----------------------------------------------------------------------------
## aggregate.

## A aggregate funciona com uso de fórmula, além de poder ser usada como a
## tapply(). O resultado é em data.frame.

s <- with(npk,
          aggregate(yield, list(Nitro=N, Phos=P, Pot=K), mean))
s

##   Nitro Phos Pot     x
## 1     0    0   0 51.43
## 2     1    0   0 63.77
## 3     0    1   0 54.33
## 4     1    1   0 57.93
## 5     0    0   1 52.00
## 6     1    0   1 54.67
## 7     0    1   1 50.50
## 8     1    1   1 54.37

s <- with(npk,
          aggregate(cbind(Y=yield), list(Nitro=N, Phos=P, Pot=K), mean))
s

##   Nitro Phos Pot     Y
## 1     0    0   0 51.43
## 2     1    0   0 63.77
## 3     0    1   0 54.33
## 4     1    1   0 57.93
## 5     0    0   1 52.00
## 6     1    0   1 54.67
## 7     0    1   1 50.50
## 8     1    1   1 54.37

## Será obtido o mesmo, mas usando uma fórmula para representar o que se
## deseja.

s <- aggregate(yield~N+P+K, data=npk, mean)
s

##   N P K yield
## 1 0 0 0 51.43
## 2 1 0 0 63.77
## 3 0 1 0 54.33
## 4 1 1 0 57.93
## 5 0 0 1 52.00
## 6 1 0 1 54.67
## 7 0 1 1 50.50
## 8 1 1 1 54.37

## Diferente da tapply, a aggregate pode ter mais de uma variável
## resposta. Por falta de outra variável resposta, será usado o log de
## yield.

s <- aggregate(cbind(y=yield, log.y=log(yield))~N+P+K, data=npk, mean)
s

##   N P K     y log.y
## 1 0 0 0 51.43 3.938
## 2 1 0 0 63.77 4.153
## 3 0 1 0 54.33 3.985
## 4 1 1 0 57.93 4.056
## 5 0 0 1 52.00 3.947
## 6 1 0 1 54.67 3.999
## 7 0 1 1 50.50 3.921
## 8 1 1 1 54.37 3.993

##-----------------------------------------------------------------------------
## by.

by(data=npk, INDICES=with(npk, N), FUN=nrow)

## with(npk, N): 0
## [1] 12
## ------------------------------------------------------------------- 
## with(npk, N): 1
## [1] 12

r <- with(npk, by(yield, INDICES=N, FUN=mean)); r

## N: 0
## [1] 52.07
## ------------------------------------------------------------------- 
## N: 1
## [1] 57.68

str(r)

##  by [1:2(1d)] 52.1 57.7
##  - attr(*, "dimnames")=List of 1
##   ..$ N: chr [1:2] "0" "1"
##  - attr(*, "call")= language by.default(data = yield, INDICES = N, FUN = mean)

c(is.list(r), is.array(r))

## [1] FALSE  TRUE

r <- with(npk, by(yield, INDICES=list(N=N, P=P, K=K), FUN=mean)); r

## N: 0
## P: 0
## K: 0
## [1] 51.43
## ------------------------------------------------------------------- 
## N: 1
## P: 0
## K: 0
## [1] 63.77
## ------------------------------------------------------------------- 
## N: 0
## P: 1
## K: 0
## [1] 54.33
## ------------------------------------------------------------------- 
## N: 1
## P: 1
## K: 0
## [1] 57.93
## ------------------------------------------------------------------- 
## N: 0
## P: 0
## K: 1
## [1] 52
## ------------------------------------------------------------------- 
## N: 1
## P: 0
## K: 1
## [1] 54.67
## ------------------------------------------------------------------- 
## N: 0
## P: 1
## K: 1
## [1] 50.5
## ------------------------------------------------------------------- 
## N: 1
## P: 1
## K: 1
## [1] 54.37

str(r)

##  by [1:2, 1:2, 1:2] 51.4 63.8 54.3 57.9 52 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ N: chr [1:2] "0" "1"
##   ..$ P: chr [1:2] "0" "1"
##   ..$ K: chr [1:2] "0" "1"
##  - attr(*, "call")= language by.default(data = yield, INDICES = list(N = N, P = P, K = K), FUN = mean)

c(is.list(r), is.array(r))

## [1] FALSE  TRUE

class(r)   ## Como é de classe by ele é mostrado de forma diferente.

## [1] "by"

unclass(r) ## Se a classe é removida, então é mostrado como array comum.

## , , K = 0
## 
##    P
## N       0     1
##   0 51.43 54.33
##   1 63.77 57.93
## 
## , , K = 1
## 
##    P
## N       0     1
##   0 52.00 50.50
##   1 54.67 54.37
## 
## attr(,"call")
## by.default(data = yield, INDICES = list(N = N, P = P, K = K), 
##     FUN = mean)

##-----------------------------------------------------------------------------
## Resumindo.

## As funções tapply, aggregate e by fazem tarefas por estrato. Em
## outras palavras, separam os valores em um (ou mais) vetores
## respeitando valores em outro (ou mais) e em seguida aplicam uma
## função. A diferença é como declarar e o que é retornado.

##-----------------------------------------------------------------------------
## apply.

Titanic

## , , Age = Child, Survived = No
## 
##       Sex
## Class  Male Female
##   1st     0      0
##   2nd     0      0
##   3rd    35     17
##   Crew    0      0
## 
## , , Age = Adult, Survived = No
## 
##       Sex
## Class  Male Female
##   1st   118      4
##   2nd   154     13
##   3rd   387     89
##   Crew  670      3
## 
## , , Age = Child, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st     5      1
##   2nd    11     13
##   3rd    13     14
##   Crew    0      0
## 
## , , Age = Adult, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st    57    140
##   2nd    14     80
##   3rd    75     76
##   Crew  192     20

str(Titanic)

##  table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"

is.array(Titanic)

## [1] TRUE

dimnames(Titanic)

## $Class
## [1] "1st"  "2nd"  "3rd"  "Crew"
## 
## $Sex
## [1] "Male"   "Female"
## 
## $Age
## [1] "Child" "Adult"
## 
## $Survived
## [1] "No"  "Yes"

sum(Titanic[ ,1, , ]) ## Total de homens.

## [1] 1731

sum(Titanic[ ,2, , ]) ## Total de mulheres.

## [1] 470

apply(Titanic, MARGIN=2, sum) ## Totais das margens para Sex.

##   Male Female 
##   1731    470

apply(Titanic, MARGIN=1, sum) ## Para Class.

##  1st  2nd  3rd Crew 
##  325  285  706  885

apply(Titanic, MARGIN=c(1,2), sum) ## Class e Sex.

##       Sex
## Class  Male Female
##   1st   180    145
##   2nd   179    106
##   3rd   510    196
##   Crew  862     23

apply(Titanic, MARGIN=c(2,4), sum) ## Sex e Survived.

##         Survived
## Sex        No Yes
##   Male   1364 367
##   Female  126 344

apply(Titanic, MARGIN=c(3,4), sum) ## Age e Survived.

##        Survived
## Age       No Yes
##   Child   52  57
##   Adult 1438 654

str(HairEyeColor)

##  table [1:4, 1:4, 1:2] 32 53 10 3 11 50 10 30 10 25 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Hair: chr [1:4] "Black" "Brown" "Red" "Blond"
##   ..$ Eye : chr [1:4] "Brown" "Blue" "Hazel" "Green"
##   ..$ Sex : chr [1:2] "Male" "Female"

dimnames(HairEyeColor)

## $Hair
## [1] "Black" "Brown" "Red"   "Blond"
## 
## $Eye
## [1] "Brown" "Blue"  "Hazel" "Green"
## 
## $Sex
## [1] "Male"   "Female"

apply(HairEyeColor, MARGIN=1, sum) ## Por cor de cabelo.

## Black Brown   Red Blond 
##   108   286    71   127

apply(HairEyeColor, MARGIN=2, sum) ## Por cor de olhos.

## Brown  Blue Hazel Green 
##   220   215    93    64

apply(HairEyeColor, MARGIN=3, sum) ## Por sexo.

##   Male Female 
##    279    313

##-----------------------------------------------------------------------------
## lapply e sapply.

is.list(rock)

## [1] TRUE

str(rock) ## Todas as colunas têm conteúdo numérico.

## 'data.frame':    48 obs. of  4 variables:
##  $ area : int  4990 7002 7558 7352 7943 7979 9333 8209 8393 6425 ...
##  $ peri : num  2792 3893 3931 3869 3949 ...
##  $ shape: num  0.0903 0.1486 0.1833 0.1171 0.1224 ...
##  $ perm : num  6.3 6.3 6.3 6.3 17.1 17.1 17.1 17.1 119 119 ...

lapply(rock, mean)  ## Média

## $area
## [1] 7188
## 
## $peri
## [1] 2682
## 
## $shape
## [1] 0.2181
## 
## $perm
## [1] 415.4

lapply(rock, range) ## Extremos.

## $area
## [1]  1016 12212
## 
## $peri
## [1]  308.6 4864.2
## 
## $shape
## [1] 0.09033 0.46413
## 
## $perm
## [1]    6.3 1300.0

## Como todas as colunas são numéricas, dá pra usar apply também (o
## data.frame é convertido em matriz antes de a função ser aplicada).
apply(rock, MARGIN=2, mean)

##      area      peri     shape      perm 
## 7187.7292 2682.2119    0.2181  415.4500

apply(rock, MARGIN=2, range)

##       area   peri   shape   perm
## [1,]  1016  308.6 0.09033    6.3
## [2,] 12212 4864.2 0.46413 1300.0

sapply(rock, mean)  ## Foi possível simplificar para um vetor.

##      area      peri     shape      perm 
## 7187.7292 2682.2119    0.2181  415.4500

sapply(rock, range) ## Foi possível simplificar para uma matriz.

##       area   peri   shape   perm
## [1,]  1016  308.6 0.09033    6.3
## [2,] 12212 4864.2 0.46413 1300.0

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

lapply(iris, is.numeric) ## Quais colunas têm conteúdo numérico?

## $Sepal.Length
## [1] TRUE
## 
## $Sepal.Width
## [1] TRUE
## 
## $Petal.Length
## [1] TRUE
## 
## $Petal.Width
## [1] TRUE
## 
## $Species
## [1] FALSE

lapply(iris, class)      ## Qual a classe?

## $Sepal.Length
## [1] "numeric"
## 
## $Sepal.Width
## [1] "numeric"
## 
## $Petal.Length
## [1] "numeric"
## 
## $Petal.Width
## [1] "numeric"
## 
## $Species
## [1] "factor"

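## Cuidado ao usar apply em data.frame: ele é convertido em matriz e,
## como Species é fator, a matriz é de caracteres. Por isso is.numeric
## retorna FALSE para todas as colunas; prefira lapply/sapply.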
apply(iris, MARGIN=2, is.numeric)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##        FALSE        FALSE        FALSE        FALSE        FALSE

sapply(iris, class) ## Foi possível simplificar.

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"

## Não foi possível simplificar.
sapply(iris, summary)

## $Sepal.Length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.30    5.10    5.80    5.84    6.40    7.90 
## 
## $Sepal.Width
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.80    3.00    3.06    3.30    4.40 
## 
## $Petal.Length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.60    4.35    3.76    5.10    6.90 
## 
## $Petal.Width
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.1     0.3     1.3     1.2     1.8     2.5 
## 
## $Species
##     setosa versicolor  virginica 
##         50         50         50

## Separar as colunas que são numéricas e então pedir o summary.
i <- sapply(iris, is.numeric); i

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##         TRUE         TRUE         TRUE         TRUE        FALSE

sapply(iris[,i], summary) ## Simplificou.

##         Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min.            4.30        2.00         1.00         0.1
## 1st Qu.         5.10        2.80         1.60         0.3
## Median          5.80        3.00         4.35         1.3
## Mean            5.84        3.06         3.76         1.2
## 3rd Qu.         6.40        3.30         5.10         1.8
## Max.            7.90        4.40         6.90         2.5

##-----------------------------------------------------------------------------
## Para praticar, considere os conjuntos de dados disponíveis no pacote
## datasets do R. O nome dos objetos com dados estão listados abaixo.

a <- ls("package:datasets")
a

##   [1] "ability.cov"           "airmiles"              "AirPassengers"        
##   [4] "airquality"            "anscombe"              "attenu"               
##   [7] "attitude"              "austres"               "beaver1"              
##  [10] "beaver2"               "BJsales"               "BJsales.lead"         
##  [13] "BOD"                   "cars"                  "ChickWeight"          
##  [16] "chickwts"              "co2"                   "CO2"                  
##  [19] "crimtab"               "discoveries"           "DNase"                
##  [22] "esoph"                 "euro"                  "euro.cross"           
##  [25] "eurodist"              "EuStockMarkets"        "faithful"             
##  [28] "fdeaths"               "Formaldehyde"          "freeny"               
##  [31] "freeny.x"              "freeny.y"              "HairEyeColor"         
##  [34] "Harman23.cor"          "Harman74.cor"          "Indometh"             
##  [37] "infert"                "InsectSprays"          "iris"                 
##  [40] "iris3"                 "islands"               "JohnsonJohnson"       
##  [43] "LakeHuron"             "ldeaths"               "lh"                   
##  [46] "LifeCycleSavings"      "Loblolly"              "longley"              
##  [49] "lynx"                  "mdeaths"               "morley"               
##  [52] "mtcars"                "nhtemp"                "Nile"                 
##  [55] "nottem"                "npk"                   "occupationalStatus"   
##  [58] "Orange"                "OrchardSprays"         "PlantGrowth"          
##  [61] "precip"                "presidents"            "pressure"             
##  [64] "Puromycin"             "quakes"                "randu"                
##  [67] "rivers"                "rock"                  "Seatbelts"            
##  [70] "sleep"                 "stackloss"             "stack.loss"           
##  [73] "stack.x"               "state.abb"             "state.area"           
##  [76] "state.center"          "state.division"        "state.name"           
##  [79] "state.region"          "state.x77"             "sunspot.month"        
##  [82] "sunspots"              "sunspot.year"          "swiss"                
##  [85] "Theoph"                "Titanic"               "ToothGrowth"          
##  [88] "treering"              "trees"                 "UCBAdmissions"        
##  [91] "UKDriverDeaths"        "UKgas"                 "USAccDeaths"          
##  [94] "USArrests"             "USJudgeRatings"        "USPersonalExpenditure"
##  [97] "uspop"                 "VADeaths"              "volcano"              
## [100] "warpbreaks"            "women"                 "WorldPhones"          
## [103] "WWWusage"

## Alguns são vetores, outros arrays, outros data.frames e poucos são
## listas. Abaixo segue uma tabela com a relação. Dentre eles, alguns são
## séries temporais, para as quais valem as regras de vetor.

## Classify every dataset by the structural predicates it satisfies.
## Each object is fetched with get() straight from the datasets package
## environment (inherits = FALSE), so a workspace object that happens to
## share a dataset's name cannot mask it and no text is parsed as code.
## vapply() guarantees a character vector with one code per name, named
## after the elements of `a`.
i <- vapply(a,
            function(nome) {
                obj <- get(nome,
                           envir = as.environment("package:datasets"),
                           inherits = FALSE)
                oq <- c(V = is.vector(obj), S = is.ts(obj),
                        A = is.array(obj), D = is.data.frame(obj),
                        L = is.list(obj))
                ## Concatenate the letters of the predicates that hold.
                paste(names(oq)[oq], collapse = "")
            },
            FUN.VALUE = character(1))

## One row per dataset, sorted by classification code; row names are reset
## to the automatic sequence after the reordering.
res <- data.frame(objeto = a, classe = i, stringsAsFactors = FALSE)
res <- res[order(res$classe), ]
rownames(res) <- NULL

## V: vector; S: time series; A: array/matrix; D: data.frame; L: list.
## An empty code means none of the five predicates returned TRUE.
by(res$objeto, res$classe, as.vector)

## res$classe: 
## [1] "eurodist"       "state.division" "state.region"  
## ------------------------------------------------------------------- 
## res$classe: A
##  [1] "crimtab"               "euro.cross"            "freeny.x"             
##  [4] "HairEyeColor"          "iris3"                 "occupationalStatus"   
##  [7] "stack.x"               "state.x77"             "Titanic"              
## [10] "UCBAdmissions"         "USPersonalExpenditure" "VADeaths"             
## [13] "volcano"               "WorldPhones"          
## ------------------------------------------------------------------- 
## res$classe: DL
##  [1] "airquality"       "anscombe"         "attenu"           "attitude"        
##  [5] "beaver1"          "beaver2"          "BOD"              "cars"            
##  [9] "ChickWeight"      "chickwts"         "CO2"              "DNase"           
## [13] "esoph"            "faithful"         "Formaldehyde"     "freeny"          
## [17] "Indometh"         "infert"           "InsectSprays"     "iris"            
## [21] "LifeCycleSavings" "Loblolly"         "longley"          "morley"          
## [25] "mtcars"           "npk"              "Orange"           "OrchardSprays"   
## [29] "PlantGrowth"      "pressure"         "Puromycin"        "quakes"          
## [33] "randu"            "rock"             "sleep"            "stackloss"       
## [37] "swiss"            "Theoph"           "ToothGrowth"      "trees"           
## [41] "USArrests"        "USJudgeRatings"   "warpbreaks"       "women"           
## ------------------------------------------------------------------- 
## res$classe: S
##  [1] "airmiles"       "AirPassengers"  "austres"        "BJsales"        "BJsales.lead"  
##  [6] "co2"            "discoveries"    "fdeaths"        "freeny.y"       "JohnsonJohnson"
## [11] "LakeHuron"      "ldeaths"        "lh"             "lynx"           "mdeaths"       
## [16] "nhtemp"         "Nile"           "nottem"         "presidents"     "sunspot.month" 
## [21] "sunspots"       "sunspot.year"   "treering"       "UKDriverDeaths" "UKgas"         
## [26] "USAccDeaths"    "uspop"          "WWWusage"      
## ------------------------------------------------------------------- 
## res$classe: SA
## [1] "EuStockMarkets" "Seatbelts"     
## ------------------------------------------------------------------- 
## res$classe: V
## [1] "euro"       "islands"    "precip"     "rivers"     "stack.loss" "state.abb" 
## [7] "state.area" "state.name"
## ------------------------------------------------------------------- 
## res$classe: VL
## [1] "ability.cov"  "Harman23.cor" "Harman74.cor" "state.center"

Aula 06

Conversão entre formatos de objetos

Vetor

Arranjo e matriz

Tabela

Lista

Família de funções *apply

Como é legal a família *apply.

Família de funções `*apply`

Como é legal a família `*apply`.