综合练习
数据分析实战演练
应用篇“表达我自己比被人喜欢更重要。” 加油
尽可能的在tidyverse的框架下完成
library(tidyverse)
day01
旋转数据框,要求
d <- tibble::tribble(
~name, ~chinese, ~math, ~physics, ~english, ~music, ~sport,
"Alice", 88L, 63L, 98L, 89L, 85L, 72L,
"Bob", 85L, 75L, 85L, 82L, 73L, 83L,
"Carlo", 95L, 98L, 75L, 75L, 68L, 84L
)
d
变成
day02
排序,要求按照score从大往小排,但希望all是最下面一行。
d <-
tibble::tribble(
~name, ~score,
"a1", 2,
"a2", 5,
"a3", 3,
"a4", 7,
"a5", 6,
"all", 23
)
变成
day03
统计每位同学,成绩高于各科均值的个数,
d <- tibble::tribble(
~name, ~chinese, ~engish, ~physics, ~sport, ~music,
"Aice", 85, 56, 56, 54, 78,
"Bob", 75, 78, 77, 56, 69,
"Cake", 69, 41, 88, 89, 59,
"Dave", 90, 66, 74, 82, 60,
"Eve", 68, 85, 75, 69, 21,
"Fod", 77, 74, 62, 74, 88,
"Gimme", 56, 88, 75, 69, 34
)
d
变成
day04
data <- tribble(
~id, ~corr, ~period,
1, 0, "a",
1, 0, "b",
2, 0, "a",
2, 1, "b",
3, 1, "a",
3, 0, "b",
4, 1, "a",
4, 1, "b"
)
data
先按id分组,
- 如果corr中都是0 就"none"
- 如果corr中都是1 就"both"
- 如果corr中只有一个1,就输出1对应period
day05
图中柱子上的字体没有显示完整,请改进。
d <- tibble::tribble(
~name, ~value,
"Alice", 2.12,
"Bob", 68.45,
"Carlie", 15.84,
"Dave", 7.38,
"Eve", 0.56
)
d %>%
ggplot(aes(x = value, y = fct_reorder(name, value)) ) +
geom_col(width = 0.6, fill = "gray60") +
geom_text(aes(label = value, hjust =1)) +
theme_classic() +
scale_x_continuous(expand = c(0, 0)) +
labs(x = NULL, y = NULL)
day06
我看到新闻有一张图很漂亮,您能重复出来?
数据在下面
d <- tibble::tribble(
~area, ~group, ~value,
"Texas A&M", "white Students", 0.03,
"Texas A&M", "Black Students", 0.07,
"Umass Amherst", "white Students", 0.07,
"Umass Amherst", "Black Students", 0.23,
"UW-Milwaukee", "white Students", 0.13,
"UW-Milwaukee", "Black Students", 0.31
)
d
提示,可以把图片拉到https://color.adobe.com/zh/create/image-gradient 获取颜色值,比如
colorspace::swatchplot(c("#F42F5D","#252A4A"))
day07
告诉你一个你可能不知道的事情,summarise()一定要输出数据框吗?
iris %>%
nest_by(Species) %>%
rowwise() %>%
summarise(
write_csv(data, glue("{Species}.cvs"))
)
day08
运行以下两个代码,结果和你期望的一样?为什么?
mtcars %>%
group_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = .))
)
mtcars %>%
group_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt))
)
day09
缺失值替换,数值型的缺失值用0替换,字符串型的用""
df <- tibble(
x = c(NA, 1, 2),
y = c("a", NA, NA),
)
day10
六年级的年级主任让学生提交自己所在的班级号,看到结果后,他很苦恼,你能帮忙他规整下?
d <- tibble::tribble(
~id,
"2",
"03",
"小学2015级2班",
"小学2015级3班",
"0601",
"0602",
"201502",
"201604",
"6.10",
"6.11",
"6.5",
"6.8",
"06"
)
d
day11
每行以x为均值生成一个随机数, 以下哪个是正确的?
# A
tibble(x = 1:5) %>%
mutate(normal_mean = rnorm(1, mean = x))
# B
tibble(x = 1:5) %>%
mutate(normal_mean = rnorm(n(), mean = x))
# C
tibble(x = 1:5) %>%
mutate(normal_mean = map_dbl(x, ~rnorm(1, mean = .)))
# D
tibble(x = 1:5) %>%
mutate(normal_mean = map_dbl(x, ~rnorm(1), mean = .))
# E
tibble(x = 1:5) %>%
rowwise() %>%
mutate(normal_mean = rnorm(1, mean = x))
day12
purrr::map()的辅助参数放里面和放外面,有什么区别?
x <- rep(0, 3)
plus <- function(x, y) x + y
map_dbl(x, plus, runif(1))
map_dbl(x, ~plus(.x, runif(1)) )
day13
计算每天水分和食物的所占比例, 比如第一天water和food都是10.0,那么各自比例都是50%.
d <- tibble::tribble(
~water, ~food,
10.0, 10.0,
12.1, 10.3,
13.5, 19.1,
17.4, 16.0,
25.8, 15.6,
27.4, 19.8
)
d
day14
以下代码哪些会给出相同的图形?
tb <- tibble(
x = rep(c(1,2,3), 2),
y = c(1:6),
group = c(rep("group1", 3), rep("group2", 3) )
)
ggplot(tb, aes(x,y)) + geom_line()ggplot(tb, aes(x,y,group=group)) + geom_line()ggplot(tb, aes(x,y,fill=group)) + geom_line()ggplot(tb, aes(x,y,color=group)) + geom_line()
day15
重复这张图
数据在下面
library(tidyverse)
raw_df <- read_rds(here::here("demo_data", "rude_behavior_in_airplane.rds"))
raw_df
day16
library(tidyverse)
genes <- paste0("gene", 1:5) %>% set_names(.)
genes
这里有一个列表,其元素list1, list2, list3是3个长度不等的向量
big_list <- list(
list1 = paste0("gene", c(1:2, 6:7)),
list2 = paste0("gene", c(6:7)),
list3 = paste0("gene", c(1, 4:7))
)
big_list
需求:想看下 [R表达式] 是否出现在 list1, list2, list3中,并统计成下表
day17
统计每支球队,比赛次数以及赢得比赛的分数之和
games <- tibble::tribble(
~team, ~outcome, ~points,
"A", "Win", 3,
"A", "Lose", 1,
"A", "Win", 1,
"A", "Win", 2,
"B", "Win", 1,
"B", "Win", 1,
"B", "Lose", 3,
"B", "Lose", 1
)
games
day18
只让gentoo这个分面,背景色高亮
library(tidyverse)
library(palmerpenguins)
penguins %>%
ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) +
geom_point() +
facet_wrap(vars(species), ncol = 3)
day19
用下面的数据,画出柱中柱效果
tb <- tibble::tribble(
~group, ~product, ~sale,
"Target", "Balender", 80,
"Target", "Fan", 90,
"Target", "Cooler", 70,
"Target", "AC", 95,
"Achieved", "Balender", 50,
"Achieved", "Fan", 55,
"Achieved", "Cooler", 60,
"Achieved", "AC", 45
)
tb
day20
修改列名,在m或者f的后面加下划线
tb <- tibble::tribble(
~id, ~new_ep_m014, ~newrel_f1524, ~new_sp_f65, ~new_ep_m3544,
1L, 3L, 2L, 4L, 1L,
2L, 4L, 4L, 5L, 2L,
3L, 5L, 5L, 6L, 6L,
4L, 6L, 6L, 7L, 8L
)
tb
day21
library(tidyverse)
df <- tibble(
x = 1:10,
y = sample(c("a", "b"), size = 10, replace = TRUE)
)
df
说出这里三行代码分别的含义
df %>%
summarise(
y1 = sum(x),
y2 = sum(y == "a"),
y3 = sum(x[y == "a"])
)
day22
看中这个张图<https://www.healthsystemtracker.org/brief/covid-19-leading-cause-of-death-ranking/>,数据也是可以下载的
day23
问题,这两张图一样吗?
library(dplyr)
library(ggplot2)
df <- data.frame(
x = rnorm(n = 2 * 500),
group = rep(c("1", "2"), each = 500)
)
ggplot(df) +
geom_line(
mapping = aes(x = x, group = group),
stat = "density",
alpha = 0.5
)
ggplot(df) +
stat_density(
mapping = aes(x = x, group = group),
geom = "line",
alpha = 0.5
)
day24
1 == "1"
结果会是什么?
- TRUE
- FALSE
- Error
- NULL
day25
如何让连续在一起的类别,分为一组。比如这里的x变量,分为4组
df <- tibble::tribble(
~x, ~y,
"a", 2,
"a", 3,
"b", 4,
"b", 5,
"a", 1,
"a", 3,
"a", 2,
"b", 3
)
df
day26
都是size = 15 为什么一个大一个小呢?
day27
ww <- c("ab1", "vf2", "aaba2", "ddb76", "d8p")
ww
- 找出数值
- 找出b后紧跟的数值
- 找出b后面出现的数值
# remove the objects
# rm(list=ls())
rm(d, data, df, my_function, text_subtitle, genes, big_list, games)
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)