|
Linguagens de Programação para Ciência de Dados |
Links úteis:
#-----------------------------------------------------------------------
# Pacotes.
# Para fazer ensaios de performance.
library(microbenchmark)
ls("package:microbenchmark")
# help(microbenchmark, h = "html")
# Troca as opções default da função.
formals(microbenchmark)$times <- 10
# Carrega pacote, exibe versão e funções/objetos públicos.
library(data.table)
packageVersion("data.table")
ls("package:data.table")
library(tidyverse)
# ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
# ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
# ✔ tibble 1.4.2 ✔ dplyr 0.7.4
# ✔ tidyr 0.8.0 ✔ stringr 1.3.0
# ✔ readr 1.1.1 ✔ forcats 0.3.0
# ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
# ✖ dplyr::between() masks data.table::between()
# ✖ dplyr::filter() masks stats::filter()
# ✖ dplyr::first() masks data.table::first()
# ✖ dplyr::lag() masks stats::lag()
# ✖ dplyr::last() masks data.table::last()
# ✖ purrr::transpose() masks data.table::transpose()
Serão usados vários conjuntos de dados nesse tutorial comparativo. O primeiro deles TODO
#-----------------------------------------------------------------------
# Carregando um conjunto de dados do repositório de ML.
# browseURL("http://archive.ics.uci.edu/ml/datasets/Bank+Marketing")
# URL do arquivo.
u <- "http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip"
if (!file.exists(basename(u))) {
download.file(u, destfile = basename(u))
utils::unzip(zipfile = basename(u))
}
# Coleção de arquivos.
dir(pattern = "^bank")
# [1] "bank-full.csv" "bank-names.txt" "bank.csv" "bank.zip"
system("wc -l bank-full.csv") # Conta o número de linhas.
system("head -n 3 bank-full.csv") # Mostra o topo do arquivo.
system("file -bi bank-full.csv") # Exibe o mimetype e encoding.
# Carregando o arquivo com utils::read.csv2().
da_bs <- read.csv2(file = "bank-full.csv",
stringsAsFactors = FALSE)
str(da_bs)
# 'data.frame': 45211 obs. of 17 variables:
# $ age : int 58 44 33 47 33 35 28 42 58 43 ...
# $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
# $ marital : chr "married" "single" "married" "married" ...
# $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
# $ housing : chr "yes" "yes" "yes" "yes" ...
# $ loan : chr "no" "no" "yes" "no" ...
# $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
# $ day : int 5 5 5 5 5 5 5 5 5 5 ...
# $ month : chr "may" "may" "may" "may" ...
# $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
# $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
# $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
# $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
# Lendo com a data.table::fread().
da_dt <- fread(file = "bank-full.csv",
header = TRUE,
sep = ";")
str(da_dt)
# Classes 'data.table' and 'data.frame': 45211 obs. of 17 variables:
# $ age : int 58 44 33 47 33 35 28 42 58 43 ...
# $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
# $ marital : chr "married" "single" "married" "married" ...
# $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
# $ housing : chr "yes" "yes" "yes" "yes" ...
# $ loan : chr "no" "no" "yes" "no" ...
# $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
# $ day : int 5 5 5 5 5 5 5 5 5 5 ...
# $ month : chr "may" "may" "may" "may" ...
# $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
# $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
# $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
# $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
# - attr(*, ".internal.selfref")=<externalptr>
# Lendo com a readr::read_csv().
da_tv <- read_csv2(file = "bank-full.csv")
# Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
# Parsed with column specification:
# cols(
# age = col_integer(),
# job = col_character(),
# marital = col_character(),
# education = col_character(),
# default = col_character(),
# balance = col_integer(),
# housing = col_character(),
# loan = col_character(),
# contact = col_character(),
# day = col_integer(),
# month = col_character(),
# duration = col_integer(),
# campaign = col_integer(),
# pdays = col_integer(),
# previous = col_integer(),
# poutcome = col_character(),
# y = col_character()
# )
str(da_tv, give.attr = FALSE)
# Classes 'tbl_df', 'tbl' and 'data.frame': 45211 obs. of 17 variables:
# $ age : int 58 44 33 47 33 35 28 42 58 43 ...
# $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
# $ marital : chr "married" "single" "married" "married" ...
# $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
# $ housing : chr "yes" "yes" "yes" "yes" ...
# $ loan : chr "no" "no" "yes" "no" ...
# $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
# $ day : int 5 5 5 5 5 5 5 5 5 5 ...
# $ month : chr "may" "may" "may" "may" ...
# $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
# $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
# $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
# $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
res <- microbenchmark(
"R básico" = {
read.csv2(file = "bank-full.csv",
stringsAsFactors = FALSE)
},
"DT" = {
fread(file = "bank-full.csv",
header = TRUE,
sep = ";")
},
"TV" = {
suppressMessages(read_csv2(file = "bank-full.csv"))
},
times = 25)
res
# Unit: milliseconds
# expr min lq mean median uq max
# R básico 339.78571 342.41909 357.96177 343.29685 348.81434 414.36601
# DT 42.66718 42.84687 44.27542 43.31551 45.27545 50.76589
# TV 80.72948 84.16913 103.27553 85.99513 88.50011 217.87106
# neval cld
# 25 c
# 25 a
# 25 b
boxplot(res)
da_bs <- da_bs[order(da_bs$age), ]
da_bs <- da_bs[order(da_bs$marital, da_bs$age, decreasing = TRUE), ]
rbind(head(da_bs), tail(da_bs))
# age job marital education default balance housing loan
# 42461 86 retired single secondary no 614 no no
# 31052 83 retired single primary no 3349 no no
# 41790 83 retired single primary no 1965 no no
# 41523 77 retired single primary no 300 no no
# 41424 73 retired single secondary no 1050 no no
# 43214 73 retired single secondary no 1050 no no
# 6935 25 blue-collar divorced secondary no 2428 yes no
# 8960 25 blue-collar divorced secondary no 720 yes no
# 13287 25 technician divorced secondary no 86 no yes
# 35585 25 technician divorced tertiary no 2317 yes no
# 40518 25 services divorced secondary no 1694 no no
# 38567 24 blue-collar divorced secondary no 513 yes no
# contact day month duration campaign pdays previous poutcome y
# 42461 telephone 9 dec 595 1 -1 0 unknown yes
# 31052 telephone 12 feb 89 1 -1 0 unknown no
# 41790 telephone 13 oct 1003 3 -1 0 unknown yes
# 41523 cellular 9 sep 511 1 -1 0 unknown no
# 41424 cellular 4 sep 73 2 -1 0 unknown no
# 43214 cellular 4 mar 562 3 181 2 failure yes
# 6935 unknown 28 may 376 4 -1 0 unknown no
# 8960 unknown 4 jun 156 1 -1 0 unknown no
# 13287 cellular 8 jul 483 2 -1 0 unknown no
# 35585 cellular 7 may 273 4 -1 0 unknown no
# 40518 cellular 7 jul 159 2 -1 0 unknown no
# 38567 cellular 15 may 61 3 -1 0 unknown no
da_dt <- da_dt[order(age)]
da_dt <- da_dt[order(marital, age, decreasing = TRUE)]
setorder(da_dt, marital, -age)
setorderv(da_dt, cols = c("marital", "age"), order = c(1, -1))
da_dt
# age job marital education default balance housing loan
# 1: 95 retired divorced primary no 2282 no no
# 2: 94 retired divorced secondary no 1234 no no
# 3: 90 retired divorced secondary no 1 no no
# 4: 90 retired divorced primary no 712 no no
# 5: 89 retired divorced primary no 1323 no no
# ---
# 45207: 18 student single secondary no 156 no no
# 45208: 18 student single primary no 608 no no
# 45209: 18 student single unknown no 108 no no
# 45210: 18 student single unknown no 348 no no
# 45211: 18 student single unknown no 438 no no
# contact day month duration campaign pdays previous poutcome y
# 1: telephone 21 apr 207 17 -1 0 unknown yes
# 2: cellular 3 mar 212 1 -1 0 unknown no
# 3: cellular 13 feb 152 3 -1 0 unknown yes
# 4: telephone 3 mar 557 1 -1 0 unknown yes
# 5: telephone 29 dec 207 4 189 1 other no
# ---
# 45207: cellular 4 nov 298 2 82 4 other no
# 45208: cellular 13 nov 210 1 93 1 success yes
# 45209: cellular 9 feb 92 1 183 1 success yes
# 45210: cellular 5 may 443 4 -1 0 unknown yes
# 45211: cellular 1 sep 425 1 -1 0 unknown no
da_tv <- arrange(da_tv, age)
da_tv <- arrange(da_tv, marital, -age)
da_tv
# # A tibble: 45,211 x 17
# age job marital education default balance housing loan contact
# <int> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr>
# 1 95 retired divorced primary no 2282 no no telepho…
# 2 94 retired divorced secondary no 1234 no no cellular
# 3 90 retired divorced secondary no 1 no no cellular
# 4 90 retired divorced primary no 712 no no telepho…
# 5 89 retired divorced primary no 1323 no no telepho…
# 6 87 retired divorced primary no 6746 no no telepho…
# 7 86 retired divorced primary no 0 no no telepho…
# 8 86 retired divorced unknown no 157 no no telepho…
# 9 85 retired divorced primary no 7613 no no cellular
# 10 84 retired divorced primary no 2619 no no telepho…
# # ... with 45,201 more rows, and 8 more variables: day <int>, month <chr>,
# # duration <int>, campaign <int>, pdays <int>, previous <int>,
# # poutcome <chr>, y <chr>
res <- microbenchmark(
"R básico" = {
x <- da_bs[order(da_bs$marital,
sample(c(-1, 1), size = 1) * da_bs$age), ]
},
"DT" = {
setorderv(da_dt,
cols = c("marital", "age"),
order = c(1, sample(c(-1, 1), size = 1)))
},
"TV" = {
x <- arrange(da_tv,
marital,
sample(c(-1, 1), size = 1) * age)
},
times = 50)
res
# Unit: microseconds
# expr min lq mean median uq max
# R básico 28958.759 30173.528 36671.508 31845.906 34067.027 144750.894
# DT 377.261 503.664 1107.243 644.716 1644.134 4010.612
# TV 16847.288 17765.189 28290.847 22451.878 23673.739 138286.251
# neval cld
# 50 b
# 50 a
# 50 b
boxplot(res)
tb <- da_bs[da_bs$age > 70, ]
tb <- da_bs[da_bs$age > 50 & da_bs$marital == "divorced", ]
tb <- da_bs[da_bs$balance >= 1000 & da_bs$balance <= 2000, ]
tb <- subset(da_bs, age > 70)
tb <- subset(da_bs, age > 50 & marital == "divorced")
tb <- subset(da_bs, balance >= 1000 & balance <= 2000)
tb <- da_dt[age > 70]
tb <- da_dt[age > 50 & marital == "divorced"]
tb <- da_dt[balance >= 1000 & balance <= 2000]
tb <- da_dt[data.table::between(balance, lower = 1000, upper = 2000)]
tb <- subset(da_bs, age > 70)
tb <- subset(da_bs, age > 50 & marital == "divorced")
tb <- subset(da_bs, balance >= 1000 & balance <= 2000)
tb <- subset(da_bs, data.table::between(balance, lower = 1000, upper = 2000))
tb <- filter(da_tv, age > 70)
tb <- filter(da_tv, age > 50, marital == "divorced")
tb <- filter(da_tv, balance >= 1000 & balance <= 2000)
tb <- filter(da_tv, dplyr::between(balance, left = 1000, right = 2000))
u <- unique(da_bs$marital) # Valores para estado civil.
x <- range(da_bs$age) # Domínio dos valores de idade.
res <- microbenchmark(
"R básico []" = {
xi <- floor(runif(n = 1, min(x), max(x)))
ui <- sample(u, size = 1)
da <- da_bs[da_bs$age > xi & da_bs$marital == ui, ]
},
"DT []" = {
xi <- floor(runif(n = 1, min(x), max(x)))
ui <- sample(u, size = 1)
da <- da_dt[age > xi & marital == ui]
},
"R básico subset"= {
xi <- floor(runif(n = 1, min(x), max(x)))
ui <- sample(u, size = 1)
da <- subset(da_bs, age > xi & marital == ui)
},
"DT subset" = {
xi <- floor(runif(n = 1, min(x), max(x)))
ui <- sample(u, size = 1)
da <- subset(da_dt, age > xi & marital == ui)
},
"TV" = {
xi <- floor(runif(n = 1, min(x), max(x)))
ui <- sample(u, size = 1)
da <- filter(da_tv, age > xi, marital == ui)
},
times = 200)
res
# Unit: microseconds
# expr min lq mean median uq max
# R básico [] 1166.881 1288.116 3252.470 1693.754 3466.065 120580.637
# DT [] 963.358 1116.693 1692.778 1261.964 1725.922 6195.623
# R básico subset 1383.501 1536.329 3335.285 2077.280 3700.675 117878.055
# DT subset 1216.881 1361.689 1961.758 1582.941 2240.361 5919.951
# TV 1444.103 1713.945 2331.252 1927.048 2531.082 6175.410
# neval cld
# 200 b
# 200 a
# 200 b
# 200 ab
# 200 ab
boxplot(res)
tb <- da_bs[, c("age", "marital", "education")]
tb <- da_dt[, c(1:4, 7, 10:12)]
tb <- da_dt[, -c(1:4, 7, 10:12)]
tb <- Filter(f = is.numeric, x = da_bs)
tb <- subset(da_bs, select = c("age", "marital", "education"))
tb <- subset(da_bs, select = c(age, marital, education))
tb <- subset(da_bs, select = -c(age, marital, education))
tb <- subset(da_bs, select = c(1:4, 7, 10:12))
tb <- subset(da_bs, select = -c(1:4, 7, 10:12))
tb <- da_dt[, c("age", "marital", "education")]
tb <- da_dt[, -c("age", "marital", "education")]
tb <- da_dt[, !c("age", "marital", "education")]
tb <- da_dt[, list(age, marital, education)]
tb <- da_dt[, c(1:4, 7, 10:12)]
tb <- da_dt[, -c(1:4, 7, 10:12)]
tb <- Filter(f = is.numeric, x = da_dt)
tb <- da_dt[, Filter(f = is.numeric, x = .SD)]
tb <- subset(da_dt, select = c("age", "marital", "education"))
tb <- subset(da_dt, select = c(age, marital, education))
tb <- subset(da_dt, select = -c(age, marital, education))
tb <- subset(da_dt, select = c(1:4, 7, 10:12))
tb <- subset(da_dt, select = -c(1:4, 7, 10:12))
tb <- select(da_tv, age, marital, education)
tb <- select(da_tv, -age, -marital, -education)
tb <- select(da_tv, c(age, marital, education))
tb <- select(da_tv, -c(age, marital, education))
tb <- select(da_tv, c("age", "marital", "education"))
tb <- select(da_tv, c(1:4, 7, 10:12))
tb <- select(da_tv, -c(1:4, 7, 10:12))
tb <- select(da_tv, -c(1:4, 7, 10:12))
tb <- select_if(da_tv, .predicate = is.numeric)
v <- names(da_bs)
res <- microbenchmark(
"R básico []" = da_bs[, sample(v, size = 5)],
"DT []" = da_dt[, sample(v, size = 5)],
"R básico subset" = subset(da_bs, select = sample(v, size = 5)),
"DT subset" = subset(da_dt, select = sample(v, size = 5)),
"TV" = select(da_dt, sample(v, size = 5)),
times = 300)
res
# Unit: microseconds
# expr min lq mean median uq
# R básico [] 29.373 41.8595 52.08615 52.742 59.7010
# DT [] 209.894 246.9415 308.40203 284.998 363.3570
# R básico subset 6758.479 7311.2030 8220.14884 7516.890 7797.3650
# DT subset 694.682 844.3180 1017.50973 907.257 997.1915
# TV 810.756 896.7330 1026.05590 1047.047 1093.7465
# max neval cld
# 148.077 300 a
# 568.975 300 a
# 122970.104 300 c
# 3465.556 300 b
# 1785.854 300 b
boxplot(res)
tb <- da_bs[, 1:17]
tb$x <- log(tb$age)
tb$y <- tb$education %in% c("tertiary", "secondary")
str(tb)
tb <- da_bs[, 1:17]
tb <- transform(tb,
x = log(age),
y = education %in% c("tertiary", "secondary"))
str(tb)
tb <- da_bs[, 1:17]
tb <- within(tb, {
x <- log(age)
y <- education %in% c("tertiary", "secondary")
})
str(tb)
tb <- da_dt[, 1:17]
tb$x <- log(tb$age)
tb$y <- tb$education %in% c("tertiary", "secondary")
str(tb)
tb <- da_dt[, 1:17]
tb <- transform(tb,
x = log(age),
y = education %in% c("tertiary", "secondary"))
str(tb)
tb <- da_dt[, 1:17]
tb <- within(tb, {
x <- log(age)
y <- education %in% c("tertiary", "secondary")
})
str(tb)
tb <- da_dt[, 1:17]
tb[, x := log(age)]
tb[, y := education %in% c("tertiary", "secondary")]
str(tb)
tb <- da_dt[, 1:17]
tb[, c("x", "y") := list(log(age),
education %in% c("tertiary", "secondary"))]
str(tb)
tb <- mutate(da_tv,
x = log(age),
y = education %in% c("tertiary", "secondary"))
str(tb)
tb <- aggregate(age ~ marital,
data = da_bs,
FUN = mean,
na.rm = TRUE)
tb
tb <- aggregate(cbind(age, balance) ~ marital,
data = da_bs,
FUN = mean,
na.rm = TRUE)
tb
tb <- aggregate(cbind(age, balance) ~ marital + education,
data = da_bs,
FUN = mean,
na.rm = TRUE)
tb
tb <- da_dt[, mean(age), by = marital]
tb <- da_dt[, list("age" = mean(age)), by = marital]
tb
tb <- da_dt[, list("age" = mean(age)),
by = list(marital, education)]
tb
tb <- da_dt[, list(age = mean(age),
balance = mean(balance)),
by = list(marital, education)]
tb
tb <- da_dt[, lapply(.SD, FUN = mean),
by = list(marital, education),
.SDcols = c("age", "balance")]
tb
tb <- da_dt[, lapply(.SD, FUN = mean),
by = list(marital, education),
.SDcols = c(1, 6)]
tb
tb <- da_dt[, lapply(.SD, FUN = mean),
by = list(marital, education),
.SDcols = which(sapply(da_dt, FUN = is.numeric))]
tb
tb <- summarize(group_by(da_tv, marital), mean(age))
tb <- summarize(group_by(da_tv, marital), age = mean(age))
tb
tb <- summarize(group_by(da_tv, marital, education),
age = mean(age))
tb
tb <- summarize(group_by(da_tv, marital, education),
age = mean(age),
balance = mean(balance))
tb
tb <- summarize_if(group_by(da_tv, marital, education),
.predicate = is.numeric,
.funs = mean)
tb
res <- microbenchmark(
"R básico" = {
tb <- aggregate(cbind(age, balance) ~ marital + education,
data = da_bs,
FUN = mean)
},
"DT" = {
tb <- da_dt[, list(age = mean(age),
balance = mean(balance)),
by = list(marital, education)]
},
"TV" = {
tb <- summarize(group_by(da_tv, marital, education),
age = mean(age),
balance = mean(balance))
},
times = 100)
res
# Unit: milliseconds
# expr min lq mean median uq max
# R básico 41.840317 45.055345 49.315869 45.965036 46.634020 161.083126
# DT 1.427484 1.486467 1.678471 1.685643 1.714068 4.067303
# TV 3.638447 3.851691 4.418660 3.975860 4.110340 26.002645
# neval cld
# 100 b
# 100 a
# 100 a
boxplot(res)
tb <- aggregate(cbind(age, balance) ~ marital,
data = da_bs,
FUN = function(x) {
c(m = mean(x),
s = sd(x),
n = length(x))
})
tb
tb <- da_dt[, list(m = mean(age),
s = sd(age),
n = length(age)),
by = marital]
tb
da_dt <- da_dt[, 1:17]
str(da_dt)
tb <- da_dt[, c(lapply(.SD, FUN = mean),
lapply(.SD, FUN = sd),
lapply(.SD, FUN = length)),
by = marital,
.SDcols = c("age", "balance")]
tb
# Com sulfixo da estatística calculada.
tb <- da_dt[,
as.list(
unlist(
lapply(X = .SD,
FUN = function(x) {
list(m = mean(x),
s = sd(x),
n = length(x))
})
)
),
by = marital,
.SDcols = c("age", "balance")]
tb
tb <- summarize_at(group_by(da_tv, marital),
.vars = c("age", "balance"),
.funs = c(m = "mean", s = "sd", n = "length"))
tb
res <- microbenchmark(
"R básico" = {
tb <- aggregate(cbind(age, balance) ~ marital,
data = da_bs,
FUN = function(x) {
c(m = mean(x),
s = sd(x),
n = length(x))
})
},
"DT" = {
tb <- da_dt[,
as.list(
unlist(
lapply(X = .SD,
FUN = function(x) {
list(m = mean(x),
s = sd(x),
n = length(x))
})
)
),
by = marital,
.SDcols = c("age", "balance")]
},
"TV" = {
tb <- summarize_at(group_by(da_tv, marital),
.vars = c("age", "balance"),
.funs = c(m = "mean",
s = "sd",
n = "length"))
},
times = 100)
res
# Unit: milliseconds
# expr min lq mean median uq max
# R básico 34.737042 37.279377 38.600736 37.751631 37.996814 151.358636
# DT 1.729073 1.859887 2.085782 1.965153 2.066161 4.156473
# TV 5.982416 6.235895 8.182480 6.369518 6.763842 121.179770
# neval cld
# 100 c
# 100 a
# 100 b
boxplot(res)
tb <- aggregate(duration ~ education + marital + job + housing,
data = da_bs,
FUN = mean)
head(tb)
# De long para wide.
tb <- reshape2::dcast(data = tb,
formula = education + job + housing ~ marital,
value.var = "duration")
str(tb)
# De wide para long.
tb <- reshape2::melt(data = tb,
id.vars = 1:3)
tb <- da_dt[, list(duration = mean(duration)),
by = list(education, marital, job, housing)]
tb
# De long para wide.
tb <- data.table::dcast(data = tb,
formula = education + job + housing ~ marital,
value.var = "duration")
str(tb)
# De wide para long.
tb <- data.table::melt(data = tb,
id.vars = 1:3)
tb <- summarize(group_by(da_tv, education, marital, job, housing),
duration = mean(duration))
tb
# De long para wide.
tb <- spread(data = tb,
key = "marital",
value = "duration")
tb
# De wide para long.
tb <- gather(data = tb,
4:6,
key = "marital",
value = "duration")
tb
tb_bs <- aggregate(duration ~ education + marital + job + housing,
data = da_bs,
FUN = mean)
tb_dt <- da_dt[, list(duration = mean(duration)),
by = list(education, marital, job, housing)]
tb_tv <- summarize(group_by(da_tv, education, marital, job, housing),
duration = mean(duration))
res <- microbenchmark(
"R básico" = {
a <- reshape2::dcast(data = tb_bs,
formula = education + job + housing ~ marital,
value.var = "duration")
b <- reshape2::melt(data = a, id.vars = 1:3)
},
"DT" = {
a <- data.table::dcast(data = tb_dt,
formula = education + job + housing ~ marital,
value.var = "duration")
b <- data.table::melt(data = a, id.vars = 1:3)
},
"TV" = {
a <- spread(data = tb_tv, key = "marital", value = "duration")
b <- gather(data = a, 4:6, key = "marital", value = "duration")
},
times = 300)
res
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# R básico 2.084531 2.182899 2.760189 2.340662 2.436483 83.80884 300 a
# DT 2.419919 2.494823 2.702049 2.644840 2.716862 6.16838 300 a
# TV 6.296591 6.594716 7.297693 6.783788 6.961580 70.07708 300 b
boxplot(res)
da_bs <- da_bs[, 1:17]
da_bs$id <- seq_len(nrow(da_bs))
v <- sample(seq_len(ncol(da_bs) - 1), size = 10)
tb1 <- subset(da_bs, select = c(ncol(da_bs), v))
tb2 <- subset(da_bs, select = c(-v))
tb2 <- tb2[sample(seq_len(nrow(da_bs)),
size = floor(nrow(da_bs) * 0.7)), ]
# Inner join.
tb <- merge(tb1, tb2)
str(tb)
da_dt <- da_dt[, 1:17]
da_dt$id <- seq_len(nrow(da_dt))
v <- sample(seq_len(ncol(da_dt) - 1), size = 10)
tb1 <- subset(da_dt, select = c(ncol(da_dt), v))
tb2 <- subset(da_dt, select = c(-v))
tb2 <- tb2[sample(seq_len(nrow(da_dt)),
size = floor(nrow(da_dt) * 0.7)), ]
# Inner join.
tb <- merge(tb1, tb2)
str(tb)
# Inner join.
tb <- tb1[tb2, nomatch = 0L, on = "id"]
str(tb)
setkey(tb1, id)
setkey(tb2, id)
tb <- tb1[tb2, nomatch = 0L]
str(tb)
da_tv <- da_tv[, 1:17]
da_tv$id <- seq_len(nrow(da_tv))
v <- sample(seq_len(ncol(da_tv) - 1), size = 10)
tb1 <- select(da_tv, c(ncol(da_bs), v))
tb2 <- select(da_tv, c(-v))
tb2 <- tb2[sample(seq_len(nrow(da_tv)),
size = floor(nrow(da_tv) * 0.7)), ]
tb <- inner_join(tb1, tb2)
str(tb)
tb <- da_dt[, 1:17]
tb <- rbind(tb, tb, tb, tb, tb, tb)
dim(tb)
# [1] 271266 17
tb$id <- seq_len(nrow(tb))
v <- sample(seq_len(ncol(tb) - 1), size = 10)
tb1_bs <- subset(tb, select = c(ncol(tb), v))
tb2_bs <- subset(tb, select = c(-v))
tb2_bs <- tb2_bs[sample(seq_len(nrow(tb)),
size = floor(nrow(tb) * 0.7)), ]
tb1_dt <- as.data.table(tb1_bs)
tb2_dt <- as.data.table(tb2_bs)
setkey(tb1_dt, id)
setkey(tb2_dt, id)
tb1_tv <- as_tibble(tb1_bs)
tb2_tv <- as_tibble(tb2_bs)
# c(nrow(tb1_bs), nrow(tb1_dt), nrow(tb1_tv))
# c(nrow(tb2_bs), nrow(tb2_dt), nrow(tb2_tv))
# c(ncol(tb1_bs), ncol(tb1_dt), ncol(tb1_tv))
# c(ncol(tb2_bs), ncol(tb2_dt), ncol(tb2_tv))
res <- microbenchmark(
"R básico" = {
tb <- merge(tb1_bs, tb2_bs)
},
"DT" = {
# tb <- merge(tb1_dt, tb2_dt)
tb <- tb1_dt[tb2_dt, nomatch = 0L]
},
"TV" = {
tb <- suppressMessages(inner_join(tb1_tv, tb2_tv))
},
times = 100)
res
# Unit: milliseconds
# expr min lq mean median uq max neval
# R básico 53.62105 54.72223 77.15435 60.59499 68.30228 204.5785 100
# DT 24.78764 25.55383 43.93059 26.47648 34.43317 170.0658 100
# TV 152.16594 160.29273 176.57559 162.63240 170.25771 312.0249 100
# cld
# b
# a
# c
boxplot(res)
Linguagens de programação para Ciência de Dados |
Prof. Fernando P. Mayer Prof. Walmes M. Zeviani Prof. Elias T;. Krainski |