42

across 之美(三)

across 与 c_across

Tidyverse 篇

有同学说across()函数只能在summarise()或者mutate()中使用,事实上能使用across()的函数还是挺多的。我们列举一些看看。

R
library(tidyverse)
library(palmerpenguins)

# Work on a copy of the penguins data with every incomplete row dropped.
penguins <- drop_na(palmerpenguins::penguins)

用在mutate()

R
# Log-transform every numeric column and convert character columns to factors.
mutate(
  penguins,
  across(.cols = where(is.numeric), .fns = log),
  across(.cols = where(is.character), .fns = as.factor)
)

用在summarise()

R
# Per-species summary that mixes across() with an ordinary expression.
# Area is computed first on purpose: summarise() evaluates its arguments in
# order and each result replaces the column of the same name, so once
# across(starts_with("bill_length_"), mean) has run, bill_length_mm refers to
# the group mean instead of the per-penguin measurements.
penguins %>%
  group_by(species) %>%
  summarise(
    Area = mean(bill_length_mm * bill_depth_mm),
    across(starts_with("bill_length_"), mean),
    across(starts_with("bill_depth_"), min)
  )
R
# A scalar summary next to across() used without .fns: across() then simply
# returns the selected columns, so the result keeps one row per penguin.
# NOTE(review): dplyr >= 1.1.0 reportedly warns about multi-row summarise()
# results and recommends reframe() -- confirm against the installed version.
summarise(
  select(penguins, species, sex, bill_length_mm),
  mean = mean(bill_length_mm),
  across(.cols = -bill_length_mm)
)

用在group_by()

R
# Group by several columns selected through across(), then average bill length.
# The extra argument is supplied inside a lambda: passing na.rm = TRUE through
# across(...) is deprecated since dplyr 1.1.0.
penguins %>%
  group_by(across(c(species, island, sex))) %>%
  summarise(
    across(bill_length_mm, ~ mean(.x, na.rm = TRUE))
  )
R
# Group by every factor column, then average bill length.
# The extra argument is supplied inside a lambda: passing na.rm = TRUE through
# across(...) is deprecated since dplyr 1.1.0.
penguins %>%
  group_by(across(where(is.factor))) %>%
  summarise(
    across(bill_length_mm, ~ mean(.x, na.rm = TRUE))
  )
R
#' Count rows and summarise columns within groups
#'
#' @param df A data frame.
#' @param group_vars Tidy-select expression naming the grouping columns.
#' @param sum_vars Tidy-select expression naming the columns to summarise.
#' @return The result of `summarise()`: one row per group holding the group
#'   size `n` plus the mean and standard deviation of each column selected by
#'   `sum_vars`.
sum_group_vars <- function(df, group_vars, sum_vars) {
  grouped <- group_by(df, across({{ group_vars }}))
  summarise(
    grouped,
    n = n(),
    across({{ sum_vars }}, list(mean = mean, sd = sd))
  )
}

# Group size, mean and sd of both bill measurements per species and year.
sum_group_vars(
  penguins,
  group_vars = c(species, year),
  sum_vars = c(bill_length_mm, bill_depth_mm)
)

用在filter()

R
# Small demo table: one character column followed by three integer columns.
df <- tibble(a = letters[1:5], b = 1:5, c = 6:10, d = 11:15)


# Keep the rows in which every numeric column is greater than 2.
# NOTE(review): across() inside filter() is reportedly deprecated since
# dplyr 1.0.8 in favour of if_all()/if_any(); it is kept here because this
# example exists to show across() itself -- confirm before relying on it.
df %>%
  dplyr::filter(
    across(where(is.numeric), .fns = ~ .x > 2)
  )
R
# Equivalent: if_all() keeps the rows where the predicate holds for every
# selected column.
df %>%
  dplyr::filter(
    if_all(where(is.numeric), .fns = ~ .x > 2)
  )

用在distinct()

R
# Unique island/species combinations.
distinct(penguins, across(c(island, species)))

用在arrange()

R
# Sort by a single column selected through across().
arrange(penguins, across(bill_length_mm))
R
# Sort by every column whose name ends in "_mm".
arrange(penguins, across(ends_with("_mm")))
R
#' Sort a data frame by a tidy-selected set of columns
#'
#' @param .data A data frame.
#' @param order_by Tidy-select expression naming the columns to sort by.
#' @return `.data` as returned by `arrange()` on the selected columns.
f <- function(.data, order_by) {
  arrange(.data, across({{ order_by }}))
}

# Sort the penguins by sex using the helper.
f(penguins, order_by = sex)

用在count()

R
# Number of rows per sex.
count(penguins, across(sex))
R
# Number of rows per combination of all factor columns.
count(penguins, across(where(is.factor)))

用在自定义的函数里,挺方便

R
#' Count each selected column separately
#'
#' @param df A data frame.
#' @param ... Column selection forwarded to `select()`.
#' @return A list with one `count()` table per selected column, each computed
#'   with `sort = TRUE`.
count_multiple <- function(df, ...) {
  selected <- names(select(df, ...))
  map(selected, function(col) count(df, across(all_of(col)), sort = TRUE))
}

# One frequency table per factor column.
count_multiple(penguins, where(is.factor))

用在purrr::map()

我们想求行方向的均值,根据相关章节介绍的技术,可以这样写:

R
# Row-wise mean with rowwise() + c_across().
# The result column is called `mean` because it holds a mean (it was
# previously labelled `min`), and the columns are selected explicitly because
# calling c_across() without `cols` is deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  rowwise() %>%
  mutate(
    mean = mean(c_across(everything()))
  )

也可以用相关章节介绍的函数式编程方法来实现:

R
# Row-wise mean via pmap_dfr(): every row is passed as arguments to the
# function, and the one-element lists are bound into a data frame with column z.
pmap_dfr(
  tibble(x = 1:3, y = 2:4),
  function(...) list(z = mean(c(...)))
)

事实上,我们还可以这样写,

R
# Row-wise mean as a new column: across() hands the columns to pmap_dbl() as a
# data frame and the lambda collects one row's values with c(...).
# A plain lambda replaces lift_vd(), which is deprecated since purrr 1.0.0,
# and the columns are selected explicitly because across() without `.cols` is
# deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  mutate(
    z = pmap_dbl(across(everything()), ~ mean(c(...)))
  )

或者利用mutate()的数据框并入特性(未命名的数据框结果会被自动展开成新的列)

R
# Same row-wise mean, relying on mutate() to splice an unnamed data-frame
# result into the table as new columns.
# The columns are selected explicitly because across() without `.cols` is
# deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  mutate(
    pmap_dfr(across(everything()), ~ list(z = mean(c(...))))
  )

再举一个例子,我想求出数据框每一行的多个统计值,也可以用到数据框并入

R
# Small demo table: one character column followed by three integer columns.
df <- tibble(a = letters[1:5], b = 1:5, c = 6:10, d = 11:15)

# Several statistics per row, spliced into the table as new columns.
# lst() builds its elements in order, so `ratio` can refer to the `min` and
# `max` elements created just before it.
df %>%
  mutate(
    pmap_dfr(
      across(b:d),
      function(...) {
        vals <- c(...)
        lst(min = min(vals), max = max(vals), ratio = min / max)
      }
    )
  )

再比如下面这个例子:在每一行中,将最大值出现之后的所有数值替换成0

R
# Demo table with missing values; each row is processed left to right below.
df <- tibble(x = c(55, 23, 15, 10),
             y = c(42, NA, 90, 30),
             z = c(12, 17, 10, 12),
             w = c(NA, 45, NA, NA))
df

# For each row, set every value located after the row maximum to 0.
# which.max() discards NA when locating the maximum, and `[<-` is called as a
# function so that the modified vector is returned directly.
df %>% mutate(
  pmap_dfr(
    across(everything()),
    ~ `[<-`(c(...), seq_along(c(...)) > which.max(c(...)), 0)
  )
)

也可以这样写

R
#' Zero out everything after the maximum
#'
#' @param x A vector.
#' @return `x` with every element positioned after its first maximum (as
#'   located by `which.max()`) replaced by 0. If `which.max()` finds no
#'   maximum, `x` is returned unchanged.
myfun <- function(x) {
  x[seq_along(x) > which.max(x)] <- 0
  x
}

# Same row-wise replacement, delegating the per-row work to myfun().
mutate(
  df,
  pmap_dfr(
    across(everything()),
    function(...) myfun(c(...))
  )
)

更多案例请看相关章节。

R
# remove the objects
# ls() %>% stringr::str_flatten(collapse = ", ")

#rm(cutoffs, d1, d2, df, mult, std, weights, replace_col_max)
R
# Unload the packages returned by pacman::p_loaded(); character.only = TRUE
# tells p_unload() that the packages are given as a character vector.
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)