#/////////////////////////////////////////////////////////////////////// # Pacotes -------------------------------------------------------------- # install.packages("tidyverse") library(tidyverse) # Links úteis. # browseURL("http://leg.ufpr.br/~walmes/ensino/dsbd-linprog/slides/02-r-tidyverse.html") # browseURL("http://leg.ufpr.br/~walmes/cursoR/data-vis/99-datasets.html") # browseURL("http://leg.ufpr.br/~walmes/data") #/////////////////////////////////////////////////////////////////////// # Criação dos dados ---------------------------------------------------- # Cria um tibble data.frame a partir de vetores. df1 <- tibble(matricula = c(256, 487, 965, 125, 458, 874, 963), nome = c("João", "Vanessa", "Tiago", "Luana", "Gisele", "Pedro", "André"), curso = c("Mat", "Mat", "Est", "Est", "Est", "Mat", "Est"), prova1 = c(80, 75, 95, 70, 45, 55, 30), prova2 = c(90, 75, 80, 85, 50, 75, NA), prova3 = c(80, 75, 75, 50, NA, 90, 30), faltas = c(4, 4, 0, 8, 16, 0, 20)) df2 <- tibble(matricula = c(505, 658, 713), nome = c("Bia", "Carlos", "Cris"), curso = c("Eng", "Eng", "Eng"), prova1 = c(65, 75, 75), prova2 = c(85, 80, 90), faltas = c(0, 0, 2)) df_extra <- tribble( ~mat, ~nome, ~idade, ~bolsista, 256, 'João' , 18, "S", 965, 'Tiago' , 18, "N", 285, 'Tiago' , 22, "N", 125, 'Luana' , 21, "S", 874, 'Pedro' , 19, "N", 321, 'Mia' , 18, "N", 669, 'Luana' , 19, "S", 967, 'André' , 20, "N", ) str(df1) glimpse(df1) glimpse(df2) glimpse(df_extra) #/////////////////////////////////////////////////////////////////////// # Ordenação ------------------------------------------------------------ df1 |> arrange(matricula) df1 |> arrange(nome) df1 |> arrange(curso, desc(nome)) # df1 <- df1 |> # arrange(curso, desc(nome)) # x <- scan() # x #/////////////////////////////////////////////////////////////////////// # Coluna, variável, atributo, campo, feature. # Linha, observação, casos, registro, instância, tupla. df1 |> select(nome, prova1, prova2) df1 |> select(-prova1, -prova2) df1 |> select(2, 5, 3) df1 |> select(starts_with("pro")) df1 |> select(ends_with("s")) df1 |> select(contains("va")) df1 |> select(matches("[[:digit:]]")) df1 |> select(-matches("[[:digit:]]")) df1 |> select_if(is.numeric) df1 |> select_if(is.character) df1 |> select_if(function(x) !is.character(x)) df1 |> select_if(negate(is.character)) df1 |> select_if(function(x) any(is.na(x))) df1 |> select_if(function(x) all(!is.na(x))) df1[["nome"]] df1[c("nome", "faltas")] df1[, c("nome", "faltas")] df1[, c(2, 5)] #/////////////////////////////////////////////////////////////////////// # Fatiar --------------------------------------------------------------- df1 |> slice(1:4) df1 |> slice(c(2, 5)) df1 |> slice(-c(2, 5)) df1 |> head(n = 3) df1 |> slice_head(n = 3) df1 |> slice_tail(n = 3) df1 |> slice_max(n = 3, order_by = prova1) df1 |> slice_min(n = 3, order_by = faltas) # df1 |> # arrange(faltas) |> # head(n = 3) df1[1:4, ] df1[c(2, 5), ] set.seed(123) df1 |> sample_n(3) 0.6 * nrow(df1) df1 |> sample_frac(0.6) #/////////////////////////////////////////////////////////////////////// # Filter --------------------------------------------------------------- # sample(c(TRUE, FALSE), size = nrow(df1), replace = TRUE) df1 |> filter(curso == "Mat") df1 |> filter(prova1 > 70) # OU. df1 |> filter(prova1 > 70 | prova2 > 70 | prova3 > 70) # AND. df1 |> filter(prova1 > 70 & prova2 > 70 & prova3 > 70) # AND. df1 |> filter(prova1 > 70, prova2 > 70, prova3 > 70) (df1$prova1 > 70) & (df1$prova2 > 70) # exists("y") && is.numeric(y) # exists("y") & is.numeric(y) # exists("df1") && print(df1) "Qual é a capital do Mato Grosso do Sul?" |> str_sub(-10) abbreviate("Qual é a capital do Mato Grosso do Sul?", 20) #/////////////////////////////////////////////////////////////////////// # Renomear ------------------------------------------------------------- ls("package:dplyr", pattern = "rename") df1 |> rename("mat" = "matricula", "nome_aluno" = "nome", "curso_aluno" = "curso") |> rename_with(function(x) str_replace(x, "prova", "avaliacao")) |> rename_with(toupper) df1 |> rename_if(is.character, toupper) df1 |> rename("Matrícula" = "matricula", "Nome Completo" = "nome", "Curso do Aluno" = "curso", "Acertos 1 (%)" = "prova1", "IMC (kg/m²)" = "prova3") |> janitor::clean_names() tibble::tribble( ~"Qual a capital do Mato Grosso do Sul?", "Campo Grande" ) |> rename(cidade = 1) #/////////////////////////////////////////////////////////////////////// # Modificar/tranformar ------------------------------------------------- df1 |> # drop_na() |> mutate(prova1 = replace_na(prova1, 0), prova2 = replace_na(prova2, 0), prova3 = replace_na(prova3, 0)) |> mutate(media = (prova1 + prova2 + prova3)/3) df1$media <- df1 |> mutate_at(vars(starts_with("prova")), ~replace_na(., 0)) |> select(starts_with("prova")) |> rowMeans(na.rm = TRUE) df1 |> mutate(condicao = case_when( media >= 70 ~ "Aprovado", media >= 40 ~ "Exame", TRUE ~ "Reprovado" )) cut(c(69, 70, 71), breaks = c(0, 40, 70, Inf), labels = c("R", "E", "A"), right = FALSE) df1 |> mutate(condicao = cut(media, breaks = c(0, 40, 70, Inf), labels = c("R", "E", "A"), right = TRUE)) df1 |> mutate(condicao = case_when( media >= 70 & faltas <= 15 ~ "Aprovado", media >= 40 & faltas <= 15 ~ "Exame", faltas > 15 ~ "R. faltas", # TRUE ~ "R. nota" .default = "Reprovado" )) df1 |> mutate_at(vars(starts_with("prova")), ~ifelse(. > 80, .*1.05, .)) df1 |> mutate(across(where(is.numeric) & starts_with("prova"), ~ifelse(. > 80, .*1.05, .))) #/////////////////////////////////////////////////////////////////////// # Resumir/agregar/consolidar ------------------------------------------- ls("package:dplyr", pattern = "summarise") df1 |> summarise(prova1_m = mean(prova1), prova1_s = sd(prova1), prova1_v = var(prova1), prova1_n = n(), prova1_max = max(prova1), prova1_min = min(prova1), prova1_cv = 100 * prova1_s/prova1_m, prova1_md = median(prova1), prova1_q1 = quantile(prova1, 0.25), prova1_q3 = quantile(prova1, 0.75), prova1_iqr = prova1_q3 - prova1_q1, prova1_mad = mad(prova1)) |> t() df1 |> select(-matricula, -media) |> summarise_if(is.numeric, c("m" = mean, "s" = sd), na.rm = TRUE) df1 |> select(-matricula, -media) |> mutate(across(where(is.numeric), ~replace_na(., 0))) |> summarise(across(is.numeric, list(m = mean, s = sd))) #/////////////////////////////////////////////////////////////////////// # Agrupar -------------------------------------------------------------- df1 |> group_by(curso) |> summarise(prova1_m = mean(prova1), prova1_s = sd(prova1)) x <- df1 |> group_by(curso, faltas > 10) |> select(starts_with("prova")) |> mutate_all(~replace_na(., 0)) |> summarise_all(c(m = mean, s = sd)) |> ungroup() class(x) x |> summarise(u = max(prova1_m)) df1 |> group_by(curso) |> summarise(prova1_m = mean(prova1), prova1_s = sd(prova1)) df1 |> # group_by(matricula) |> mutate(across(starts_with("prova"), ~replace_na(., 0))) |> rowwise() |> transmute(matricula, media_a = mean(c(prova1, prova2, prova3), na.rm = TRUE), media_b = mean(c_across(starts_with("prova")), na.rm = TRUE)) |> ungroup() #/////////////////////////////////////////////////////////////////////// # Pivotar -------------------------------------------------------------- ls("package:tidyr", pattern = "^pivot") df1_long <- df1 |> pivot_longer(cols = starts_with("prova"), names_to = "avaliacao", values_to = "nota") |> drop_na(nota) df1_long |> print(n = Inf) # df1 |> # gather(key = "avaliacao", # value = "nota", # starts_with("prova")) |> # print(n = Inf) df1_long |> pivot_wider(names_from = "avaliacao", values_from = "nota", values_fill = 0) # df1_long |> # spread(key = "avaliacao", # value = "nota", # fill = 0) #/////////////////////////////////////////////////////////////////////// # Concatenar ----------------------------------------------------------- ls("package:dplyr", pattern = "bind") df1 df2 df2 <- df2 |> mutate(across(starts_with("prova"), ~replace_na(., 0))) |> rowwise() |> mutate(media = mean(c_across(starts_with("prova")), na.rm = TRUE)) |> ungroup() bind_rows(df1, df2) df3 <- df1 |> select(-matricula) |> transmute(across(starts_with("prova"), log, .names = "{.col}_log")) bind_cols(df1, df3) df1 |> select(-matricula) |> mutate(across(starts_with("prova"), log, .names = "{.col}_log")) #/////////////////////////////////////////////////////////////////////// # Juntar/conciliar ----------------------------------------------------- ls("package:dplyr", pattern = "join") str(df1) str(df_extra) inner_join(df1, df_extra, by = c("matricula" = "mat", "nome")) full_join(df1, df_extra, by = c("matricula" = "mat", "nome")) left_join(df1, select(df_extra, -prova1), by = c("matricula" = "mat", "nome")) right_join(df1, df_extra, by = c("matricula" = "mat", "nome")) left_join(df_extra, df1, by = c("mat" = "matricula", "nome")) anti_join(df1, df_extra, by = c("matricula" = "mat", "nome")) semi_join(df1, df_extra, by = c("matricula" = "mat", "nome")) #----------------------------------------------------------------------- url <- "http://leg.ufpr.br/~walmes/data/aval_carros_nota.txt" # browseURL(url) # Leitura com `readr`. tb <- read_tsv(file = url, col_names = TRUE) class(tb) str(tb, give.attr = FALSE) tb_w <- pivot_wider(tb, names_from = "item", values_from = "nota") tb_w tb_w |> select(-dono, -carro) |> cor() #----------------------------------------------------------------------- url <- "http://leg.ufpr.br/~walmes/data/inmet-insolacao-total.txt" tb <- read.table(url, header = TRUE, comment = "#", sep = "\t", dec = ",", na.strings = "-", stringsAsFactors = FALSE) str(tb) tb |> count(UF) tb_l <- tb |> pivot_longer(cols = Janeiro:Dezembro, names_to = "mes", values_to = "insolacao") tb_l |> filter(UF == "PR") |> ggplot(aes(x = mes, y = insolacao, group = 1)) + facet_wrap(~Nome.da.Estação) + geom_line() #---------------------------------------------------------------------- # url <- "https://raw.githubusercontent.com/leg-ufpr/hackathon/master/notas.csv" # # url <- "~/Dropbox/web-scrap/hackathon/notas.csv" # # # tb <- read_csv2(url, col_names = FALSE) # tb <- read_csv2(url, col_names = c("id", "item", "nota")) # tb url <- "https://raw.githubusercontent.com/leg-ufpr/hackathon/master/opinioes.json" # url <- "/home/walmes/Dropbox/web-scrap/hackathon/opinions.json" js <- jsonlite::fromJSON(url) str(js) js <- as_tibble(js) str(js) js <- js %>% setNames(nm = c("id", "tit", "veic", "dono", "desc", "pros", "cont", "defe", "opin", "ts")) str(js) glimpse(js) js |> head(n = 8) |> select(dono) |> separate(col = dono, c("nome", "local"), sep = " - ", remove = FALSE) |> mutate(uf = sub(pattern = ".*(..)$", "\\1", local), municipio = sub(pattern = "(.*) ..$", "\\1", local)) js |> head(n = 8) |> transmute(km = sub(pattern = ".*- (.*) km.*", replacement = "\\1", x = desc), km = as.numeric(km), km = 1000 * km, anterior = sub(pattern = ".*: (.*)$", replacement = "\\1", x = desc), periodo = sub(pattern = "(.*) -.*", replacement = "\\1", x = desc), periodo = gsub("\\D+", "", periodo), periodo = as.integer(periodo)) |> unite(col = "nova", anterior, periodo, km, sep = " | ") js$desc[1] x <- "Dono há 6 anos - 35.200 kmCarro anterior: Fiat Uno" gsub(pattern = ".*(\\d) anos.*- (.*) km.*: (.*)$", replacement = "\\2 km, \\1 anos, anteriorme \\3", x = x) tb <- tribble( ~ano, ~local, ~idh, 2010, "MS", 0.7, 2010, "PR", 0.8, 2010, "SP", 0.9, 2011, "MS", 0.71, 2011, "PR", 0.81, 2012, "MS", 0.72, 2012, "PR", 0.82, 2013, "SP", 0.91 ) tb |> expand(ano, local) tb |> complete(ano, local) tb |> complete(ano, local) |> group_by(local) |> fill(idh, .direction = "down") #---------------------------------------------------------------------- tb |> complete(ano, local) |> group_split(local) tb |> nest(data = c(ano, idh)) tb |> nest(data = c(ano, idh)) |> unnest(data) tb |> filter(local == "MS") |> lm(idh ~ ano, data = _) |> coef() |> purrr::pluck("ano") tb |> nest(data = c(ano, idh)) |> mutate(model = map_chr(data, function(x) { b <- lm(idh ~ ano, data = x) |> coef() ifelse(b > 0, "sim", "não") })) # REGEX: {stringr}. # Datas: {lubridate}. # Fatores: {forcats}.