#-----------------------------------------------------------------------
# R for Data Science
#
# Prof. Dr. Walmes M. Zeviani & Prof. Dr. Wagner H. Bonat
# Department of Statistics · Federal University of Paraná
# 2021-May-11 · Curitiba/PR/Brazil
#-----------------------------------------------------------------------

#-----------------------------------------------------------------------
# Packages.

library(tidyverse)

#-----------------------------------------------------------------------
# Importing data.

# Scores, read with read.csv2().
url_csv <- "https://raw.githubusercontent.com/leg-ufpr/hackathon/master/notas.csv"
tb_nt <- read.csv2(url_csv)
str(tb_nt)

# Opinions, read from JSON and coerced to a data frame.
url_json <- "https://raw.githubusercontent.com/leg-ufpr/hackathon/master/opinioes.json"
tb_op <- jsonlite::read_json(url_json, simplifyVector = TRUE)
tb_op <- as.data.frame(tb_op)
str(tb_op)

# Assign working column names (positional: the JSON is assumed to carry
# exactly these ten fields in this order -- TODO confirm).
names(tb_op) <- c("ID", "title", "model", "usercity", "usage", "pro",
                  "con", "problem", "opinion", "ts")

# Long to wide: one row per ID, one column per value of "quesito",
# filled with the matching "nota".
tb_nt <- tb_nt %>%
  pivot_wider(id_cols = "ID",
              names_from = "quesito",
              values_from = "nota")
str(tb_nt)

# Keep only IDs present in both tables.
tb <- inner_join(tb_nt,
                 tb_op[, c("ID", "model", "usercity", "usage", "ts")],
                 by = "ID")
str(tb)

#-----------------------------------------------------------------------
# Regex-based extraction.

# Drop everything up to (and including) the last "- " in `usercity`.
tb$location <- tb$usercity %>%
  str_remove("^.*- ")

# Last two characters of the location.
tb$uf <- tb$location %>%
  str_sub(start = -2)

# First year of the "dddd/dddd" pattern found in `model`.
tb$year <- tb$model %>%
  str_extract("\\d{4}/\\d{4}") %>%
  str_sub(end = 4) %>%
  as.integer()

# `model` without the " dddd/dddd" suffix.
tb$spec <- tb$model %>%
  str_remove(" \\d{4}/\\d{4}")

# How long the vehicle has been owned (years). Strings that do not
# match the pattern are left untouched by str_replace() and therefore
# become NA (with a coercion warning) in as.integer().
tb$period <- tb$usage %>%
  str_remove("menos de") %>%
  str_replace(".*há +(\\d+) +ano.*", "\\1") %>%
  as.integer()

# Previous vehicle.
tb$lastcar <- ifelse(
  str_detect(tb$usage, "Carro anterior:"),
  str_replace(tb$usage, ".*Carro anterior: (.*)$", "\\1"),
  NA_character_
)

# Distance driven. Every "." is removed (str_remove_all), because
# str_remove() would drop only the first thousands separator and a
# value such as "1.234.567" would be truncated to 1234 by as.integer().
tb$km <- ifelse(
  str_detect(tb$usage, "\\d+ km"),
  str_replace(tb$usage, ".* ([0-9.]+) km.*", "\\1"),
  NA_character_
) %>%
  str_remove_all("\\.") %>%
  as.integer()

# Evaluation date.
tb$ts <- tb$ts %>%
  as.POSIXct(format = "%d/%m/%Y %H:%M:%S")

# Manufacturer: first word of `spec` / `lastcar`; `model` is overwritten
# with the second word of `spec`. Strings that do not match the pattern
# are returned unchanged by str_replace().
tb$brand <- tb$spec %>%
  str_replace("^(\\w+) .*", "\\1")
tb$brand_last <- tb$lastcar %>%
  str_replace("^(\\w+) .*", "\\1")
tb$model <- tb$spec %>%
  str_replace("^\\w+ (\\w+) .*", "\\1")
str(tb)

#-----------------------------------------------------------------------
# Analysis.

tb %>%
  count(brand)
tb %>%
  count(brand, model)
tb %>%
  count(brand_last, sort = TRUE)

# Keep the 4 most frequent previous brands; lump the rest together.
tb <- tb %>%
  mutate(brand_last = fct_lump_n(brand_last, n = 4))

tb %>%
  filter(brand == "Chevrolet") %>%
  ggplot(data = ., mapping = aes(y = brand_last)) +
  geom_bar()

#-----------------------------------------------------------------------
# Mean of the evaluated variables by manufacturer and model.

# across() replaces the superseded do() + summarise_at() + bind_cols()
# combination; the result has the same columns in the same order
# (brand, model, means of Estilo:Recomendação, n) and is ungrouped.
tb_agg <- tb %>%
  group_by(brand, model) %>%
  summarise(across(Estilo:Recomendação, mean),
            n = n(),
            .groups = "drop")
tb_agg

#-----------------------------------------------------------------------
# Maps.

library(geobr)

states <- read_state(year = 2019)
# saveRDS(states, file = "geobr_states.rds")
# states <- readRDS(file = "geobr_states.rds")
class(states)

tb_count <- tb %>%
  count(uf)
print(tb_count, n = Inf)

names(tb)

# Per-state count and mean recommendation score for Volkswagen only.
tb_count <- tb %>%
  filter(brand == "Volkswagen") %>%
  group_by(uf) %>%
  summarise(n = n(), y = mean(Recomendação))

# Join the databases. NOTE: `states` is overwritten, so re-running this
# statement without reloading `states` joins the summary columns again.
states <- dplyr::left_join(states, tb_count,
                           by = c("abbrev_state" = "uf"))
names(states)

ggplot() +
  geom_sf(data = states,
          mapping = aes(fill = y),
          color = "black",
          size = .15) +
  scale_fill_distiller(palette = "Blues",
                       direction = 1,
                       name = "Nota") +
  theme_minimal()

#-----------------------------------------------------------------------
# A small shiny interface.
library(esquisse)

# Launch the interactive plot builder on the iris data.
esquisser(iris)

# Petal width vs. petal length, one panel per species.
# The x label previously said "sépala" (sepal) although Petal.Width is
# mapped, and both labels said "mm" although iris is in centimetres.
ggplot(iris) +
  aes(x = Petal.Width, y = Petal.Length) +
  geom_point(size = 4.3, colour = "#112446") +
  labs(
    x = "Largura da pétala (cm)",
    y = "Comprimento da pétala (cm)",
    title = "Dados do dataset iris",
    subtitle = "Diagrama dispersão",
    caption = "Feito com amor usando o R"
  ) +
  theme_gray() +
  facet_wrap(vars(Species))

str(states)
esquisser(states)

esquisser(iris)

# Other exploratory packages. The package that provides vis_dat() is
# named "visdat"; library(vis_dat) fails because no such package exists.
library(DataExplorer)
library(skimr)
library(visdat)

#-----------------------------------------------------------------------