Tidyverse 技巧 — 数据科学中的 R 语言

count()

我之前多次用到count()函数，其功能就是统计某个变量中各组出现的次数

R

df <- tibble(
  name = c("Alice", "Alice", "Bob", "Bob", "Carol", "Carol"),
  type = c("english", "math", "english", "math", "english", "math"),
  score = c(60.2, 90.5, 92.2, 98.8, 82.5, 74.6)
)

df

R

df %>% count(name)

如果用之前讲的group_by() + summarise()来写，

R

df %>% 
  group_by(name) %>% 
  summarise( n = n())

count() 还有更多强大的参数，比如

R

df %>% count(name,
  sort = TRUE,
  wt = score,
  name = "total_score"
)

如果不用count()，用group_by() + summarise()写，

R

df %>%
  group_by(name) %>%
  summarise(
    n = n(),
    total_score = sum(score, na.rm = TRUE)
  ) %>%
  arrange(desc(total_score))

当然，count()在特定场合下的简便写法，遇到复杂的分组统计，还是得用用group_by() + summarise()组合。

在 count() 中创建新变量

可以在count()里构建新变量，并利用这个新变量完成统计

R

df %>% count(range = 10 * (score %/% 10))

add_count()

想增加一列，代表每人参加的考试次数

R

df %>%
  group_by(name) %>%
  mutate(n = n()) %>%
  ungroup()

可以有更简单的方法

R

df %>% add_count(name)

nth(), first(), last()

R

v <- c("a", "c", "d", "k")

R

v[1]
v[length(v)]

R

c("a", "c", "d", "k") %>% nth(3)

R

c("a", "c", "d", "k") %>% first()
c("a", "c", "d", "k") %>% last()

用在数据框中，同样可以使用

R

df %>%
  filter(score == first(score))

R

df %>%
  group_by(name) %>%
  filter(score == last(score))

列变量重新排序

比如想把score放在第一列

R

df %>%
  select(score, everything())

这个方法，对列变量较多的情形非常适用。

if_else

R

df %>% mutate(
  assess = if_else(score &gt; 85, "very_good", "good")
)

case_when

R

df %>% mutate(
  assess = case_when(
    score &lt; 70 ~ "general",
    score &gt;= 70 &amp; score &lt; 80 ~ "good",
    score &gt;= 80 &amp; score &lt; 90 ~ "very_good",
    score &gt;= 90 ~ "best",
    TRUE ~ "other"
  )
)

找出前几名

R

df %>%
  top_n(2, score)

去除多余的空白

R

library(stringr)

str_trim(" excess    whitespace in a string be gone!")

R

# Use str_squish() to remove any leading, trailing, or excess whitespace
str_squish(" excess    whitespace in a string be gone!")

取反操作

R

3:10 %in% c(1:5)

有时候需要一个不属于的操作符，可以自定义一个不属于操作符

R

`%nin%` <- Negate(`%in%`)
3:10 %nin% c(1:5)

或者使用purrr::negate()自定义反向操作符

R

`%nin%` <- purrr::negate(`%in%`)
3:10 %nin% c(1:5)

drop_na()

R

dt <- tribble(
  ~x, ~y,
  1, NA,
  2, NA,
  NA, -3,
  NA, -4,
  5, -5
)

dt

R

dt %>% drop_na()
# dt %>% drop_na(x)

replace_na()

R

dt <- tribble(
  ~x, ~y,
  1, NA,
  2, NA,
  NA, -3,
  NA, -4,
  5, -5
)

dt %>% mutate(x = replace_na(x, 0))

R

dt %>% mutate(
  x = replace_na(x, mean(x, na.rm = TRUE))
  )

之前讲正则表达式也有类似的函数stringr::str_replace_na()，

coalesce

R

dt <- tribble(
  ~x, ~y,
  1, NA,
  2, NA,
  NA, -3,
  NA, -4,
  5, -5
)

dt %>% mutate(
  z = coalesce(x, 0)
  # z = coalesce(x, y)
)

有时候，我们可能为了减少信息丢失，想填充NA

R

dt <- tribble(
  ~name, ~age,
  "a", 1,
  "b", 2,
  "c", NA,
  "d", 2
)


dt %>%
  mutate(
    age_adj = ifelse(is.na(age), mean(age, na.rm = TRUE), age)
  )

summarise() 生成 list-column

summarize()会生成一个value，

R

library(gapminder)
gapminder %>%
  group_by(continent) %>%
  summarise(
    avg_gdpPercap = mean(gdpPercap)
  )

summarize()也可以生成一个list，比如下面做单样本的t检验

R

library(gapminder)
gapminder %>%
  group_by(continent) %>%
  summarise(test = list(t.test(gdpPercap))) %>% 

  mutate(tidied = purrr::map(test, broom::tidy)) %>%
  unnest(tidied) %>%
  ggplot(aes(estimate, continent)) +
  geom_point() +
  geom_errorbarh(aes(
    xmin = conf.low,
    xmax = conf.high
  ))

或者线性回归

R

gapminder %>%
  group_by(continent) %>%
  summarise(test = list(lm(lifeExp ~ gdpPercap))) %>% 

  mutate(tidied = purrr::map(test, broom::tidy, conf.int = TRUE)) %>%
  unnest(tidied) %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(estimate, continent)) +
  geom_point() +
  geom_errorbarh(aes(
    xmin = conf.low,
    xmax = conf.high,
    height = .3
  ))

以下两种方法，同样完成上面的工作，具体方法会在相关章节介绍

R

gapminder %>%
  group_nest(continent) %>%
  mutate(test = map(data, ~ t.test(.x$gdpPercap))) %>%
  mutate(tidied = map(test, broom::tidy)) %>%
  unnest(tidied)

R

gapminder %>%
  group_by(continent) %>%
  group_modify(
    ~ broom::tidy(t.test(.x$gdpPercap))
  )

count() + fct_reorder() + geom_col() + coord_flip()

最好用的四件套

R

gapminder %>%
  distinct(continent, country) %>%
  count(continent) %>%
  ggplot(aes(x = continent, y = n)) +
  geom_col()

R

gapminder %>%
  distinct(continent, country) %>%
  count(continent) %>%
  ggplot(aes(x = fct_reorder(continent, n), y = n)) +
  geom_col() +
  coord_flip()

画图容易，但画出一张好图并不容易

R

gapminder %>%
  distinct(continent, country) %>%
  count(continent) %>% 
  mutate(coll = if_else(continent == "Asia", "red", "gray")) %>% 


  ggplot(aes(x = fct_reorder(continent, n), y = n)) +
  geom_text(aes(label = n), hjust = -0.25) +
  geom_col(width = 0.8, aes(fill = coll) ) +
  coord_flip() +
  theme_classic() +
  scale_fill_manual(values = c("#b3b3b3a0", "#D55E00")) +
  theme(legend.position = "none",
        axis.text = element_text(size = 11)
        ) +
  labs(title = "My title", x = "")

或者偷懒，将continent == "Asia"的结果直接赋值给aes(fill = ___ )，效果与上面是一样的。

R

gapminder %>%
  distinct(continent, country) %>%
  count(continent) %>% 


  ggplot(aes(x = fct_reorder(continent, n), y = n)) +
  geom_text(aes(label = n), hjust = -0.25) +
  geom_col(width = 0.8, aes(fill = continent == "Asia") ) +
  coord_flip() +
  theme_classic() +
  scale_fill_manual(values = c("#b3b3b3a0", "#D55E00")) +
  annotate("text", x = 3.8, y = 48, label = "this is important\ncase", 
           color = "#D55E00", size = 5) +
  annotate(
    geom = "curve", x = 4.1, y = 48, xend = 4.1, yend = 35, 
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  theme(legend.position = "none",
        axis.text = element_text(size = 11)
        ) +
  labs(title = "My title", x = "")

scale_x/y_log10

现实世界很多满足对数规则

各国人均GDP
各国人口
不同人士的收入
公司的营业额

R

gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

R

gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() # A better way to log transform

fct_lump

门诊病症的流水记录： "Stuffy nose （鼻塞）", "Runny（流涕）", "Fever（发热）", "Diarrhea（腹泻）", "Vomiting（呕吐）", "Cough（咳嗽）", "Sore throat（咽痛）", "Fatigue（乏力）", "Abdominal pain（腹痛）", "Delusion（妄想）", "Auditory hallucination（幻听）", "Insomnia（失眠）", "Anemia（贫血）", "Hyperactivity（多动）", "Chest pain（胸痛）", "Chest tightness（胸闷）",

R

tb <- tibble::tribble(
  ~disease, ~n,
  "Stuffy nose", 112,
  "Runny", 130,
  "Fever", 89,
  "Diarrhea", 5,
  "Vomiting", 12,
  "Cough", 102,
  "Sore throat", 98,
  "Fatigue", 15,
  "Abdominal pain", 2,
  "Delusion", 3,
  "Auditory hallucination", 6,
  "Insomnia", 1,
  "Anemia", 8,
  "Hyperactivity", 2,
  "Chest pain", 4,
  "Chest tightness", 5
)

R

p1 <- tb %>% 
  uncount(n) %>% 

  ggplot(aes(x = disease, fill = disease)) +
  geom_bar() +
  coord_flip() +
  theme(legend.position = "none")



p2 <- tb %>% 
  uncount(n) %>% 
  mutate(
    disease = forcats::fct_lump(disease, 5),
    disease = forcats::fct_reorder(disease, .x = disease, .fun = length)
  ) %>% 
  ggplot(aes(x = disease, fill = disease)) +
  geom_bar() +
  coord_flip() +
  theme(legend.position = "none")

R

p1 + p2

fct_reoder2

让图例的顺序与图的曲线顺序一致

R

dat_wide <- tibble(
  x = 1:3,
  top = c(4.5, 4, 5.5),
  middle = c(4, 4.75, 5),
  bottom = c(3.5, 3.75, 4.5)
)


dat_wide %>%
  pivot_longer(
    cols = c(top, middle, bottom),
    names_to = "region",
    values_to = "awfulness")


dat <- dat_wide %>%
  pivot_longer(
    cols = c(top, middle, bottom),
    names_to = "region",
    values_to = "awfulness") %>%
  mutate(
    region_ABCD = factor(region),
    region_sane = fct_reorder2(region, x, awfulness)
  )

p_ABCD <- ggplot(dat, aes(x, awfulness, colour = region_ABCD)) +
  geom_line() + theme(legend.justification = c(1, 0.85))

p_sane <- ggplot(dat, aes(x, awfulness, colour = region_sane)) +
  geom_line() + theme(legend.justification = c(1, 0.85))

p_ABCD + p_sane +
  plot_annotation(
    title = 'Make the legend order = data order, with forcats::fct_reorder2()')

unite

R

dfa <- tribble(
   ~school, ~class,
  "chuansi", "01",
  "chuansi", "02",
  "shude",   "07",
  "shude",   "08",
  "huapulu", "101",
  "huapulu", "103"
)

dfa

R

df_united <- dfa %>% 
   tidyr::unite(school, class, col = "school_plus_class", sep = "_", remove = FALSE)

df_united

当然，简单的情况也可以用mutate()实现

R

dfa %>% mutate(newcol = str_c(school, "_", class))

separate()

R

df_united %>%
  tidyr::separate(school_plus_class, into = c("sch", "cls"), sep = "_", remove = F)

如果用mutate()来实现，语句就会比较复杂些

R

df_united %>% 
  mutate(sch = str_split(school_plus_class, "_") %>% map_chr(1)) %>% 
  mutate(cls = str_split(school_plus_class, "_") %>% map_chr(2))

如果每行不是都恰好分隔成两部分呢？就需要tidyr::extract(), 使用方法和tidyr::separate()类似

R

dfb <- tribble(
   ~school_class,
  "chuansi_01",
  "chuansi_02_03",
  "shude_07_0",
  "shude_08_0",
  "huapulu_101_u",
  "huapulu_103__p"
)
dfb

R

dfb %>% tidyr::separate(school_class, 
                into = c("sch", "cls"), 
                sep = "_", 
                extra = "drop",
                remove = F)

extract()

有时候分隔符搞不定的，可以用正则表达式，将捕获的每组弄成一列

R

dfc <- tibble(x = c("1-12week", "1-10wk", "5-12w", "01-05weeks"))
dfc

R

dfc %>% tidyr::extract(
  x,
  c("start", "end", "letter"), "(\\d+)-(\\d+)([a-z]+)",
  remove = FALSE
)

crossing()

先看看效果

R

tidyr::crossing(x = c("F", "M"), y = c("a", "b"), z = c(1:2))

这个函数在数据模拟的时候很方便，

R

tidyr::crossing(trials = 1:10, m = 1:5) %>%
  group_by(trials) %>%
  mutate(
    guess = sample.int(5, n()),
    result = m == guess
  ) %>%
  summarise(score = sum(result) / n())

再来一个例子

R

sim <- tribble(
  ~f, ~params,
  "rbinom", list(size = 1, prob = 0.5, n = 10)
)
sim %>%
  mutate(sim = invoke_map(f, params))

R

rep_sim <- sim %>%
  crossing(rep = 1:1e5) %>%
  mutate(sim = invoke_map(f, params)) %>%
  unnest(sim) %>%
  group_by(rep) %>%
  summarise(mean_sim = mean(sim))

head(rep_sim)

R

rep_sim %>% 
  ggplot(aes(x = mean_sim)) +
  geom_histogram(binwidth = 0.05,  fill = "skyblue") +
  theme_classic()

也可用在较复杂的模拟，比如下面介绍的大数极限定理，

R

sim <- tribble(
  ~n_tosses, ~f, ~params,
     10, "rbinom", list(size = 1, prob = 0.5, n = 15),
     30, "rbinom", list(size = 1, prob = 0.5, n = 30),
    100, "rbinom", list(size = 1, prob = 0.5, n = 100),
   1000, "rbinom", list(size = 1, prob = 0.5, n = 1000),
  10000, "rbinom", list(size = 1, prob = 0.5, n = 1e4)
)
sim_rep <- sim %>%
  crossing(replication = 1:50) %>%
  mutate(sims = invoke_map(f, params)) %>%
  unnest(sims) %>%
  group_by(replication, n_tosses) %>%
  summarise(avg = mean(sims))

R

sim_rep %>%
  ggplot(aes(x = factor(n_tosses), y = avg)) +
  ggbeeswarm::geom_quasirandom(color = "lightgrey") +
  scale_y_continuous(limits = c(0, 1)) +
  geom_hline(
    yintercept = 0.5,
    color = "skyblue", lty = 1, size = 1, alpha = 3 / 4
  ) +
  ggthemes::theme_pander() +
  labs(
    title = "50 Replicates Of Mean 'Heads' As Number Of Tosses Increase",
    y = "mean",
    x = "Number Of Tosses"
  )

数值模拟我们会在相关章节专门介绍。

作业

新建一列ratio，当sign为"positive"时，ratio等于 A除以B，当sign为"negative"时，ratio等于 B除以A

R

tb <- tibble::tribble(
  ~A, ~B, ~sign,
  100L, 50L, "positive",
  50L, 100L, "negative",
  100L, 50L, "positive",
  50L, 100L, "negative"
)
tb

用:分隔y列，并且只要前4个，构成新的数据框，并给列名c("e1", "e2", "e3", "e4")

R

df <- tibble( 
  x = 1:2,
  y = c("A1:A2:A3:A4:A5:A6",  "B1:B2:B3:B4:B5:B6")
  )
df

最好的办法

R

df %>% 
  separate(y, sep = ":", into = c("e1", "e2", "e3", "e4", "e5", "e6"), remove = FALSE) %>%
  select(1:6)

其他的方法逻辑清晰, 也不错

R

str_split_fixed(df$y, ":", 6) %>%                   # return matrix
  # or str_split(df$y, ":", simplify = TRUE, n = 6) # return matrix
  as_tibble() %>%
  select(1:4) %>%
  set_names(c("e1", "e2", "e3", "e4"))

R

df %>%
  mutate(
    new = str_split(y, pattern = ":", simplify = FALSE, n = 6) %>% 
      map(., ~.x[1:4]) # return list-column: a list of vectors
  ) %>%
  unnest_wider(col = new) %>%
  set_names(c("x", "y", "e1", "e2", "e3", "e4"))

笨办法

R

df %>%
  mutate(
    e1 = str_split(y, pattern = ":", simplify = FALSE, n = 6) %>% map_chr(1) 
  ) %>%
  mutate(
    e2 = str_split(y, pattern = ":", simplify = FALSE, n = 6) %>% map_chr(2) 
  )  %>%
  mutate(
    e3 = str_split(y, pattern = ":", simplify = FALSE, n = 6) %>% map_chr(3) 
  )  %>%
  mutate(
    e4 = str_split(y, pattern = ":", simplify = FALSE, n = 6) %>% map_chr(4) 
  )

R

# remove the objects
# rm(list=ls())
rm(`%nin%`, dat, dat_wide, df, df_united, dfa, dfb, dfc, dt, long, p_ABCD, p_sane, p1, p2, plant_heigt, rep_sim, sim, sim_rep, tb, v, wide)

R

pacman::p_unload(pacman::p_loaded(), character.only = TRUE)