across 之美(三)
across 与 c_across
Tidyverse 篇有同学说across()函数只能在summarise()或者mutate()中使用,事实上能使用across()的函数还是挺多的。我们列举一些看看。
R
library(tidyverse)
library(palmerpenguins)
# Work on complete cases only: drop every row that contains an NA.
penguins <- drop_na(palmerpenguins::penguins)
用在mutate()中
R
# across() inside mutate(): log-transform the numeric columns and turn any
# character column into a factor.
mutate(
  penguins,
  across(where(is.numeric), ~ log(.x)),
  across(where(is.character), ~ as.factor(.x))
)
用在summarise()中
R
# across() inside summarise(), mixed with an ordinary summary expression.
# The three expressions are kept in their original order because
# summarise() evaluates them one after another.
penguins %>%
  group_by(species) %>%
  summarise(
    across(starts_with("bill_length_"), ~ mean(.x)),
    Area = mean(bill_length_mm * bill_depth_mm),
    across(starts_with("bill_depth_"), ~ min(.x))
  )
R
# Combine a scalar summary with across() called without any function.
# NOTE(review): across() without .fns presumably just returns the selected
# columns, so this summarise() yields more than one row per group. dplyr
# >= 1.1.0 is expected to warn about that and recommend reframe() -- confirm
# against the installed dplyr version.
penguins %>%
select(species, sex, bill_length_mm) %>%
summarise(
mean = mean(bill_length_mm),
across(-bill_length_mm)
)
用在group_by()中
R
# across() inside group_by(): group by several columns at once, then take
# the mean bill length per group.
# na.rm is supplied through a lambda because forwarding extra arguments via
# across(...) is deprecated since dplyr 1.1.0.
penguins %>%
  group_by(across(c(species, island, sex))) %>%
  summarise(
    across(bill_length_mm, ~ mean(.x, na.rm = TRUE))
  )
R
# Group by every factor column, selected with where(), then take the mean
# bill length per group.
# na.rm is supplied through a lambda because forwarding extra arguments via
# across(...) is deprecated since dplyr 1.1.0.
penguins %>%
  group_by(across(where(is.factor))) %>%
  summarise(
    across(bill_length_mm, ~ mean(.x, na.rm = TRUE))
  )
R
# Summarise `sum_vars` within the groups defined by `group_vars`.
#
# df         A data frame.
# group_vars Tidyselect expression naming the grouping columns.
# sum_vars   Tidyselect expression naming the columns to summarise.
#
# Returns one row per group with the group size `n` plus the mean and sd of
# every column selected by `sum_vars`.
sum_group_vars <- function(df, group_vars, sum_vars) {
  grouped <- group_by(df, across({{ group_vars }}))
  summarise(
    grouped,
    n = n(),
    across({{ sum_vars }}, list(mean = mean, sd = sd))
  )
}
# Mean and sd of the two bill measurements per species and year.
sum_group_vars(
  penguins,
  group_vars = c(species, year),
  sum_vars = c(bill_length_mm, bill_depth_mm)
)
用在filter()中
R
# Small example table: one character column and three integer columns.
df <- tibble(
  a = letters[seq_len(5)],
  b = seq_len(5),
  c = seq.int(6L, 10L),
  d = seq.int(11L, 15L)
)
# across() inside filter(): keep the rows in which every numeric column is
# greater than 2.
# NOTE(review): dplyr >= 1.0.8 deprecates across() inside filter() and emits
# a warning; if_all() / if_any() are the documented replacements.
df %>%
dplyr::filter(
across(where(is.numeric), .fns = ~ .x > 2)
)
R
# Equivalent: if_all() keeps the rows where the predicate holds for every
# selected column.
dplyr::filter(
  df,
  if_all(where(is.numeric), .fns = function(col) col > 2)
)
用在distinct()中
R
# across() inside distinct(): unique island/species combinations.
distinct(penguins, across(c(island, species)))
用在arrange()中
R
# across() inside arrange(): sort by a single column.
arrange(penguins, across(bill_length_mm))
R
# Sort by every column whose name ends in "_mm", in column order.
arrange(penguins, across(ends_with("_mm")))
R
# Sort `.data` by the column(s) given in `order_by`.
#
# .data    A data frame.
# order_by Tidyselect expression naming the sort columns; it is embraced so
#          callers can pass bare column names.
f <- function(.data, order_by) {
  arrange(.data, across({{ order_by }}))
}
# Sort the penguins by sex using the helper.
f(penguins, order_by = sex)
用在count()中
R
# across() inside count(): number of rows per sex.
count(penguins, across(sex))
R
# Number of rows for every combination of the factor columns.
count(penguins, across(where(is.factor)))
用在自定义的函数里,挺方便
R
# Count the values of several columns, one frequency table per column.
#
# df   A data frame.
# ...  Tidyselect expressions forwarded to select() to choose the columns.
#
# Returns a list with one count() result per selected column, each sorted
# by descending frequency.
count_multiple <- function(df, ...) {
  selected <- names(select(df, ...))
  map(
    selected,
    function(col) count(df, across(all_of(col)), sort = TRUE)
  )
}
# One frequency table per factor column.
count_multiple(penguins, where(is.factor))
用在purrr::map()中
我们想求行方向的均值,根据相关章节介绍的技术
R
# Row-wise mean of all columns using rowwise() + c_across().
# The result column is named `mean` (it was previously labelled `min`
# although it holds the mean), and the columns are selected explicitly
# because calling c_across() without `cols` is deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  rowwise() %>%
  mutate(
    mean = mean(c_across(everything()))
  )
也可以根据相关章节介绍的函数式编程方法来实现
R
# Functional alternative: pmap_dfr() calls the function once per row and
# binds the one-element lists into a data frame with a single column `z`.
pmap_dfr(
  tibble(x = 1:3, y = 2:4),
  function(...) list(z = mean(c(...)))
)
事实上,我们还可以这样写,
R
# Row-wise mean inside mutate(): across(everything()) hands the current data
# frame to pmap_dbl(), which calls the lambda once per row.
# A plain lambda replaces lift_vd(mean), which is deprecated since purrr
# 1.0.0, and the columns are selected explicitly because across() without
# `.cols` is deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  mutate(
    z = pmap_dbl(across(everything()), ~ mean(c(...)))
  )
或者利用mutate()数据框并入
R
# An unnamed data-frame result inside mutate() is spliced into the output,
# so the column `z` returned by pmap_dfr() is appended to x and y.
# The columns are selected explicitly because across() without `.cols` is
# deprecated since dplyr 1.1.0.
tibble(
  x = 1:3,
  y = 2:4
) %>%
  mutate(
    pmap_dfr(across(everything()), ~ list(z = mean(c(...))))
  )
再举一个例子,我想求出数据框每一行的多个统计值,也可以用到数据框并入
R
# Small example table: one character column and three integer columns.
df <- tibble(
  a = letters[seq_len(5)],
  b = seq_len(5),
  c = seq.int(6L, 10L),
  d = seq.int(11L, 15L)
)
# Several per-row statistics at once: the data frame returned by pmap_dfr()
# is spliced into the result of mutate(). lst() builds its elements in
# order, so `ratio` can refer to the `min` and `max` defined just before it.
df %>%
  mutate(
    pmap_dfr(
      across(b:d),
      function(...) {
        row_vals <- c(...)
        lst(
          min = min(row_vals),
          max = max(row_vals),
          ratio = min / max
        )
      }
    )
  )
再比如下面这个例子:在每一行中,将最大值出现之后的所有数值替换成 0
R
# Example table with missing values, written row by row because the
# operation that follows works across each row.
df <- tribble(
  ~x, ~y, ~z, ~w,
  55, 42, 12, NA,
  23, NA, 17, 45,
  15, 90, 10, NA,
  10, 30, 12, NA
)
df
# For every row, set all values located after the row maximum to 0.
# which.max() skips NA, so an NA after the maximum is overwritten as well.
df %>%
  mutate(
    pmap_dfr(
      across(everything()),
      function(...) {
        row_vals <- c(...)
        replace(row_vals, seq_along(row_vals) > which.max(row_vals), 0)
      }
    )
  )
也可以这样写
R
# Set every element positioned after the (first) maximum of `x` to 0.
#
# x  A vector. which.max() ignores NA when locating the maximum.
#
# Returns `x` with the trailing elements replaced by 0; if no maximum can be
# located (empty or all-NA input) nothing is replaced.
myfun <- function(x) {
  after_max <- seq_along(x) > which.max(x)
  replace(x, after_max, 0)
}
# Same row-wise replacement, delegating the per-row work to myfun().
df %>%
  mutate(
    pmap_dfr(
      across(everything()),
      function(...) myfun(c(...))
    )
  )
更多案例请看相关章节。
R
# remove the objects
# ls() %>% stringr::str_flatten(collapse = ", ")
#rm(cutoffs, d1, d2, df, mult, std, weights, replace_col_max)
R
# Clean up at the end of the chapter: pass the package names reported by
# pacman::p_loaded() to pacman::p_unload() as character strings.
# NOTE(review): presumably this detaches every add-on package attached in
# this chapter -- confirm against the pacman documentation.
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)