Universidade Federal do Paraná
Curso de Estatística
CE 083 - Estatística Computacional I - 2014/2
Prof. Dr. Walmes Marques Zeviani

Aula 04

Tabela de conteúdo

Tipos de conteúdo para vetor
Conversões entre tipos de conteúdo
Objetos do tipo fator
Edição de fatores: rotulação e ordenação

Tipos de conteúdo para vetor

##-----------------------------------------------------------------------------
## Content types.

## Numeric: real numbers. The class is "numeric" and the storage type is
## "double", as the output below shows.
x <- c(2, 5, 7, 3, 9)
c(class(x), typeof(x))

## [1] "numeric" "double"

## Compact one-line summary of the object's structure.
str(x)

##  num [1:5] 2 5 7 3 9

## Estimated memory used by the object (the figure may vary by platform).
object.size(x)

## 88 bytes

## Text representation that recreates the object when evaluated.
dput(x)

## c(2, 5, 7, 3, 9)

## Integers. Read the L suffix as "literal" or "limited"; it tells the
## integer 2 apart from the double 2.0000000000.
x <- c(2L, 5L, 7L, 3L, 9L)
c(class(x), typeof(x))

## [1] "integer" "integer"

str(x)

##  int [1:5] 2 5 7 3 9

object.size(x)

## 72 bytes

dput(x)

## c(2L, 5L, 7L, 3L, 9L)

## Logical: an elementwise comparison yields one TRUE/FALSE per element.
y <- x <= 7
c(class(y), typeof(y))

## [1] "logical" "logical"

str(y)

##  logi [1:5] TRUE TRUE TRUE TRUE FALSE

object.size(y)

## 72 bytes

dput(y)

## c(TRUE, TRUE, TRUE, TRUE, FALSE)

## Character: a sequence (string) of alphanumeric characters, hence the
## name "string".
z <- c("2", "5", "7", "3", "9")
c(class(z), typeof(z))

## [1] "character" "character"

str(z)

##  chr [1:5] "2" "5" "7" "3" "9"

object.size(z)

## 328 bytes

dput(z)

## c("2", "5", "7", "3", "9")

## Existem outros tipos de conteúdo para vetor que veremos a seguir,
## como os fatores e datas. O importante é saber as diferenças práticas
## entre os formatos, como o espaço ocupado em disco, a relevância para
## a representação de valores e como converter de um formato para outro
## quando necessário.

Conversões entre tipos de conteúdo

##-----------------------------------------------------------------------------
## Build one vector of each type to convert between formats.
## num2 and logi are random draws and no seed is set here, so their values
## change from run to run.

num1 <- c(3, 6, 3, 7, 9, 0)
num2 <- round(10 * runif(7), 1)
int1 <- c(3L, 6L, 3L, 7L, 9L, 0L)
int2 <- 3:9
## Spell out `replace` instead of relying on partial argument matching.
logi <- sample(c(TRUE, FALSE), 8, replace = TRUE)
chr1 <- c("3", "6", "3", "7", "9", "0")
chr2 <- c("es", "ta", "tis", "ti", "ca")

##-----------------------------------------------------------------------------
## Conversions.
## Uses the vectors num1, num2, logi, chr1 and chr2 created beforehand.

x <- as.integer(num1)
dput(x)

## c(3L, 6L, 3L, 7L, 9L, 0L)

## Careful: as.integer() truncates the decimal part instead of rounding.
num2

## [1] 8.4 2.9 7.6 9.0 3.6 4.1 2.2

x <- as.integer(num2)
dput(x)

## c(8L, 2L, 7L, 9L, 3L, 4L, 2L)

## Strings that spell numbers convert cleanly.
x <- as.numeric(chr1)
dput(x)

## c(3, 6, 3, 7, 9, 0)

x <- as.integer(chr1)
dput(x)

## c(3L, 6L, 3L, 7L, 9L, 0L)

## Strings that do not spell a number become NA, with a warning.
## Print the result so it matches the recorded output below.
x <- as.integer(chr2)
x

## Warning: NAs introduced by coercion

## [1] NA NA NA NA NA

logi

## [1]  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE

## Logical to number: TRUE becomes 1 and FALSE becomes 0.
x <- as.numeric(logi)
dput(x)

## c(1, 0, 0, 1, 1, 0, 1, 0)

x <- as.integer(logi)
dput(x)

## c(1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L)

## Number to text: note that 9.0 is written as "9".
x <- as.character(num2)
x

## [1] "8.4" "2.9" "7.6" "9"   "3.6" "4.1" "2.2"

x <- as.character(logi)
x

## [1] "TRUE"  "FALSE" "FALSE" "TRUE"  "TRUE"  "FALSE" "TRUE"  "FALSE"

##-----------------------------------------------------------------------------
## Which conversion options exist?

## List every object on the search path whose name starts with "as.".
## The dot is escaped so that it is matched literally by the regex.
apropos("^as\\.")

##   [1] "as.array"                      "as.array.default"             
##   [3] "as.call"                       "as.character"                 
##   [5] "as.character.condition"        "as.character.Date"            
##   [7] "as.character.default"          "as.character.error"           
##   [9] "as.character.factor"           "as.character.hexmode"         
##  [11] "as.character.numeric_version"  "as.character.octmode"         
##  [13] "as.character.POSIXt"           "as.character.srcref"          
##  [15] "as.complex"                    "as.data.frame"                
##  [17] "as.data.frame.array"           "as.data.frame.AsIs"           
##  [19] "as.data.frame.character"       "as.data.frame.complex"        
##  [21] "as.data.frame.data.frame"      "as.data.frame.Date"           
##  [23] "as.data.frame.default"         "as.data.frame.difftime"       
##  [25] "as.data.frame.factor"          "as.data.frame.integer"        
##  [27] "as.data.frame.list"            "as.data.frame.logical"        
##  [29] "as.data.frame.matrix"          "as.data.frame.model.matrix"   
##  [31] "as.data.frame.numeric"         "as.data.frame.numeric_version"
##  [33] "as.data.frame.ordered"         "as.data.frame.POSIXct"        
##  [35] "as.data.frame.POSIXlt"         "as.data.frame.raw"            
##  [37] "as.data.frame.table"           "as.data.frame.ts"             
##  [39] "as.data.frame.vector"          "as.Date"                      
##  [41] "as.Date.character"             "as.Date.date"                 
##  [43] "as.Date.dates"                 "as.Date.default"              
##  [45] "as.Date.factor"                "as.Date.numeric"              
##  [47] "as.Date.POSIXct"               "as.Date.POSIXlt"              
##  [49] "as.dendrogram"                 "as.difftime"                  
##  [51] "as.dist"                       "as.double"                    
##  [53] "as.double.difftime"            "as.double.POSIXlt"            
##  [55] "as.environment"                "as.expression"                
##  [57] "as.expression.default"         "as.factor"                    
##  [59] "as.formula"                    "as.function"                  
##  [61] "as.function.default"           "as.graphicsAnnot"             
##  [63] "as.hclust"                     "as.hexmode"                   
##  [65] "as.integer"                    "as.list"                      
##  [67] "as.list.data.frame"            "as.list.Date"                 
##  [69] "as.list.default"               "as.list.environment"          
##  [71] "as.list.factor"                "as.list.function"             
##  [73] "as.list.numeric_version"       "as.list.POSIXct"              
##  [75] "as.logical"                    "as.logical.factor"            
##  [77] "as.matrix"                     "as.matrix.data.frame"         
##  [79] "as.matrix.default"             "as.matrix.noquote"            
##  [81] "as.matrix.POSIXlt"             "as.name"                      
##  [83] "as.null"                       "as.null.default"              
##  [85] "as.numeric"                    "as.numeric_version"           
##  [87] "as.octmode"                    "as.ordered"                   
##  [89] "as.package_version"            "as.pairlist"                  
##  [91] "as.person"                     "as.personList"                
##  [93] "as.POSIXct"                    "as.POSIXct.date"              
##  [95] "as.POSIXct.Date"               "as.POSIXct.dates"             
##  [97] "as.POSIXct.default"            "as.POSIXct.numeric"           
##  [99] "as.POSIXct.POSIXlt"            "as.POSIXlt"                   
## [101] "as.POSIXlt.character"          "as.POSIXlt.date"              
## [103] "as.POSIXlt.Date"               "as.POSIXlt.dates"             
## [105] "as.POSIXlt.default"            "as.POSIXlt.factor"            
## [107] "as.POSIXlt.numeric"            "as.POSIXlt.POSIXct"           
## [109] "as.qr"                         "as.raster"                    
## [111] "as.raw"                        "as.relistable"                
## [113] "as.roman"                      "as.single"                    
## [115] "as.single.default"             "as.stepfun"                   
## [117] "as.symbol"                     "as.table"                     
## [119] "as.table.default"              "as.ts"                        
## [121] "as.vector"                     "as.vector.factor"

## As conversões são entre tipos de conteúdo (ex: num -> int) e entre
## tipos de forma (ex: vetor -> matriz, lista -> vetor, etc).

Objetos do tipo fator

##-----------------------------------------------------------------------------
## Creating a factor directly.

x <- factor(c("B", "B", "B", "C", "C", "A", "C", "B", "A", "B"))
## The class is "factor" but the underlying storage type is integer.
c(class(x), typeof(x))

## [1] "factor"  "integer"

str(x)

##  Factor w/ 3 levels "A","B","C": 2 2 2 3 3 1 3 2 1 2

## dput() shows the internals: integer codes plus the table of labels.
dput(x)

## structure(c(2L, 2L, 2L, 3L, 3L, 1L, 3L, 2L, 1L, 2L), .Label = c("A", 
## "B", "C"), class = "factor")

## The labels of the levels.
levels(x)

## [1] "A" "B" "C"

## How many levels there are.
nlevels(x)

## [1] 3

##-----------------------------------------------------------------------------
## Converting a vector with another content type into a factor.
## The sample is random and no seed is set here, so the recorded values
## below are only one possible outcome.

## Spell out `replace` instead of relying on partial argument matching.
x <- sample(c("A", "B", "C"), 10, replace = TRUE)
str(x)

##  chr [1:10] "B" "C" "A" "B" "C" "B" "C" "C" "C" "C"

x <- as.factor(x)
str(x)

##  Factor w/ 3 levels "A","B","C": 2 3 1 2 3 2 3 3 3 3

levels(x)

## [1] "A" "B" "C"

nlevels(x)

## [1] 3

##-----------------------------------------------------------------------------
## When should data be treated as a factor?

## * When the number of classes is finite and possibly known even before
##   the data are collected, e.g. the states of Brazil, satisfaction
##   levels {bad, fair, good, excellent}, the sex of an individual.
## * Since the number of levels is smaller than the number of elements in
##   the population/sample, levels repeat in the sample. Storing the data
##   as a factor then saves space, because each level is tied to a cheap
##   integer code and a legend links those codes to the labels.

## Compare the number of levels (k) against the sample size (n).
## Each pair below is c(size as character, size as factor), in bytes.
## `replace` is spelled out in every call, matching the LETTERS example.

x <- sample(c("A", "B", "C"), 10, replace = TRUE)
c(object.size(x), object.size(as.factor(x))) ## No gain, n/k is small.

## [1] 312 624

x <- sample(c("A", "B", "C"), 100, replace = TRUE)
c(object.size(x), object.size(as.factor(x))) ## Tie, n/k is moderate.

## [1] 984 976

x <- sample(c("A", "B", "C"), 1000, replace = TRUE)
c(object.size(x), object.size(as.factor(x))) ## Gain, n/k is large.

## [1] 8184 4576

LETTERS ## The alphabet.

##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U"
## [22] "V" "W" "X" "Y" "Z"

x <- sample(LETTERS, 1000, replace = TRUE)
c(object.size(x), object.size(as.factor(x))) ## Gain.

## [1] 9288 5856

Edição de fatores: rotulação e ordenação

##-----------------------------------------------------------------------------
## Change the order of the levels (re-ordering).

## Prepending satis guarantees that every level occurs at least once.
## The 7 extra values are random (no seed is set here).
satis <- c("pessimo", "ruim", "bom", "regular", "otimo")
x <- as.factor(c(satis, sample(satis, 7, replace = TRUE)))
x

##  [1] pessimo ruim    bom     regular otimo   bom     otimo   otimo   bom     otimo  
## [11] pessimo regular
## Levels: bom otimo pessimo regular ruim

## The default order is alphanumeric. To change it, use the levels
## argument of factor().

## Both calls build the same factor, ordered from worst to best. The
## explicit vector must list "regular" before "bom" to agree with the
## index form and with the recorded levels(y) output below.
y <- factor(x, levels = c("pessimo", "ruim", "regular", "bom", "otimo"))
y <- factor(x, levels = levels(x)[c(3, 5, 4, 1, 2)]) ## Quicker to type.
levels(x)

## [1] "bom"     "otimo"   "pessimo" "regular" "ruim"

levels(y)

## [1] "pessimo" "ruim"    "regular" "bom"     "otimo"

## The counts are the same; only the order of the columns changes.
table(x)

## x
##     bom   otimo pessimo regular    ruim 
##       3       4       2       2       1

table(y)

## y
## pessimo    ruim regular     bom   otimo 
##       2       1       2       3       4

##-----------------------------------------------------------------------------
## Change the labels of the levels (re-labelling).

## Prepending cores guarantees that every level occurs at least once.
## The 8 extra values are random (no seed is set here).
cores <- c("red", "green", "blue", "yellow")
x <- as.factor(c(cores, sample(cores, 8, replace = TRUE)))
str(x)

##  Factor w/ 4 levels "blue","green",..: 3 2 1 4 2 4 4 4 3 3 ...

levels(x)

## [1] "blue"   "green"  "red"    "yellow"

## labels are matched by position to the current (sorted) levels of x.
y <- factor(x, labels = c("Azul", "Verde", "Vermelho", "Amarelo"))
str(y)

##  Factor w/ 4 levels "Azul","Verde",..: 3 2 1 4 2 4 4 4 3 3 ...

table(x)

## x
##   blue  green    red yellow 
##      1      3      4      4

table(y)

## y
##     Azul    Verde Vermelho  Amarelo 
##        1        3        4        4

##-----------------------------------------------------------------------------
## Change the labels and the order at the same time.

## levels sets the new order; labels renames them position by position.
z <- factor(x,
            levels = c("yellow", "red", "green", "blue"),
            labels = c("Amarelo", "Vermelho", "Verde", "Azul"))
str(z)

##  Factor w/ 4 levels "Amarelo","Vermelho",..: 2 3 4 1 3 1 1 1 2 2 ...

table(z)

## z
##  Amarelo Vermelho    Verde     Azul 
##        4        4        3        1

##-----------------------------------------------------------------------------
## The documentation of levels() points to still other ways to proceed.

## help(levels, help_type="html")

## Assign levels one at a time.
x <- gl(2, 4) ## equivalent to as.factor(rep(1:2, each=4))
levels(x)[1] <- "low"
levels(x)[2] <- "high"
x

## [1] low  low  low  low  high high high high
## Levels: low high

## Assign all levels at once.
y <- gl(2, 4)
levels(y) <- c("low", "high")
y

## [1] low  low  low  low  high high high high
## Levels: low high

## Combine/merge levels: repeating a label collapses those levels into one.
z <- gl(3, 2)
levels(z)

## [1] "1" "2" "3"

levels(z) <- c("A", "B", "A")
z

## [1] A A B B A A
## Levels: A B

## The same thing, but with a named list (new label = old levels).
z <- gl(3, 2)
levels(z) <- list(A = c(1, 3), B = 2)
z

## [1] A A B B A A
## Levels: A B

## To add levels: the new level "c" exists but has no observations.
f <- factor(c("a", "b"))
levels(f)

## [1] "a" "b"

levels(f) <- c("a", "b", "c")
f

## [1] a b
## Levels: a b c

## To rename levels: a named list maps new label = old level. "C" matches
## no existing level, so it ends up as an empty level.
f <- factor(c("a", "b"))
levels(f)

## [1] "a" "b"

levels(f) <- list(C = "C", A = "a", B = "b")
f

## [1] A B
## Levels: C A B

## To drop levels: setting a level to NA turns its observations into NA.
z <- gl(3, 2)
levels(z)

## [1] "1" "2" "3"

levels(z)[3] <- NA
levels(z)

## [1] "1" "2"

## Print the factor itself so it matches the recorded output below.
z

## [1] 1    1    2    2    <NA> <NA>
## Levels: 1 2

## The combine_factor() function from the reshape package is an
## interesting alternative for merging the levels of a factor. It will
## be presented later.

##-----------------------------------------------------------------------------
## Relabel with levels().

## Three levels, two observations each, with custom labels.
x <- gl(3, 2, labels = c("A", "B", "C"))
x

## [1] A A B B C C
## Levels: A B C

## Replace every label at once, position by position.
levels(x) <- c("Andre", "Bruno", "Caio")
x

## [1] Andre Andre Bruno Bruno Caio  Caio 
## Levels: Andre Bruno Caio

Aula 04

Tipos de conteúdo para vetor

Conversões entre tipos de conteúdo

Objetos do tipo fator

Edição de fatores: rotulação e ordenação

Foi bem na sabatina?