职业决策分析
数据驱动的职业选择
应用篇预备知识
R
library(tidyverse)
example <-
tibble::tribble(
~name, ~english, ~chinese, ~math, ~sport, ~psy, ~edu,
"A", 133, 100, 102, 56, 89, 89,
"B", 120, 120, 86, 88, 45, 75,
"C", 98, 109, 114, 87, NA, 84,
"D", 120, 78, 106, 68, 86, 69,
"E", 110, 99, 134, 98, 75, 70,
"F", NA, 132, 130, NA, 68, 88
)
example
缺失值检查
我们需要判断每一列的缺失值
R
example %>%
summarise(
na_in_english = sum(is.na(english)),
na_in_chinese = sum(is.na(chinese)),
na_in_math = sum(is.na(math)),
na_in_sport = sum(is.na(sport)),
na_in_psy = sum(is.na(math)), # tpyo here
na_in_edu = sum(is.na(edu))
)
我们发现,这种写法比较笨,而且容易出错,比如na_in_psy = sum(is.na(math)) 就写错了。那么有没有既偷懒又安全的方法呢?有的。但代价是需要学会across()函数,大家可以在Console中输入?dplyr::across查看帮助文档,或者看相关章节。
R
example %>%
summarise(
across(everything(), mean)
)
example %>%
summarise(
across(everything(), function(x) sum(is.na(x)) )
)
数据预处理
- 直接丢弃缺失值所在的行
R
example %>% drop_na()
- 用均值代替缺失值
R
d <- example %>%
mutate(
across(where(is.numeric), ~ if_else(is.na(.), mean(., na.rm = T), .))
)
d
- 计算总分/均值
R
d %>%
rowwise() %>%
mutate(
total = sum(c_across(-name))
)
d %>%
rowwise() %>%
mutate(
mean = mean(c_across(-name))
)
- 数据标准化处理
R
standard <- function(x) {
(x - mean(x)) / sd(x)
}
R
d %>%
mutate(
across(where(is.numeric), standard)
)
开始
文件管理中需要注意的地方
感谢康钦虹同学提供的数据,但这里有几点需要注意的地方:
| 事项 | 问题 | 解决办法 |
|---|---|---|
| 文件名 | excel的文件名是中文 | 用英文,比如 data.xlsx |
| 列名 | 列名中有-号,大小写不统一 | 规范列名,或用janitor::clean_names()偷懒 |
| 预处理 | 直接在原始数据中新增 | 不要在原始数据上改动,统计工作可以在R里实现 |
| 文件管理 | 没有层级 | 新建data文件夹装数据,与code.Rmd并列 |
R
data <- readxl::read_excel("demo_data/career-decision.xlsx", skip = 1) %>%
janitor::clean_names()
#glimpse(data)
R
d <- data %>% select(1:61)
#glimpse(d)
缺失值检查
R
d %>%
summarise(
across(everything(), ~sum(is.na(.)))
)
没有缺失值,挺好
数据预处理
采用利克特式 5 点计分... (这方面你们懂得比我多)
R
d <- d %>%
rowwise() %>%
mutate(
environment_exploration = sum(c_across(z1:z5)),
self_exploration = sum(c_across(z6:z9)),
objective_system_exploration = sum(c_across(z10:z15)),
info_quantity_exploration = sum(c_across(z16:z18)),
self_evaluation = sum(c_across(j1:j6)),
information_collection = sum(c_across(j7:j15)),
target_select = sum(c_across(j16:j24)),
formulate = sum(c_across(j25:j32)),
problem_solving = sum(c_across(j33:j39)),
career_exploration = sum(c_across(z1:z18)),
career_decision_making = sum(c_across(j1:j39))
) %>%
select(-starts_with("z"), -starts_with("j")) %>%
ungroup() %>%
mutate(pid = 1:n(), .before = sex) %>%
mutate(
across(c(pid, sex, majoy, grade, from), as_factor)
)
#glimpse(d)
标准化
R
standard <- function(x) {
(x - mean(x)) / sd(x)
}
d <- d %>%
mutate(
across(where(is.numeric), standard)
)
d
探索
想探索的问题
- 不同性别(或者年级,生源地,专业)下,各指标分值的差异性
- 两个变量的相关分析和回归分析
- 更多(欢迎大家提出了喔)
男生女生在职业探索上有所不同?
以性别为例。因为性别变量是男女,仅仅2组,所以检查男女在各自指标上的均值差异,可以用T检验。
R
d %>%
group_by(sex) %>%
summarise(
across(where(is.numeric), mean)
)
你可以给这个图颜色弄得更好看点?
R
library(ggridges)
d %>%
ggplot(aes(x = career_exploration, y = sex, fill = sex)) +
geom_density_ridges()
R
t_test_eq <- t.test(career_exploration ~ sex, data = d, var.equal = TRUE) %>%
broom::tidy()
t_test_eq
R
t_test_uneq <- t.test(career_exploration ~ sex, data = d, var.equal = FALSE) %>%
broom::tidy()
t_test_uneq
当然,也可以用相关章节介绍的统计推断的方法
R
library(infer)
obs_diff <- d %>%
specify(formula = career_exploration ~ sex) %>%
calculate("diff in means", order = c("1", "2"))
obs_diff
R
null_dist <- d %>%
specify(formula = career_exploration ~ sex) %>%
hypothesize(null = "independence") %>%
generate(reps = 5000, type = "permute") %>%
calculate(stat = "diff in means", order = c("1", "2"))
null_dist
R
null_dist %>%
visualize() +
shade_p_value(obs_stat = obs_diff, direction = "two_sided")
R
null_dist %>%
get_p_value(obs_stat = obs_diff, direction = "two_sided") %>%
#get_p_value(obs_stat = obs_diff, direction = "less") %>%
mutate(p_value_clean = scales::pvalue(p_value))
也可以用tidyverse的方法一次性的搞定所有指标
R
d %>%
pivot_longer(
cols = -c(pid, sex, majoy, grade, from),
names_to = "index",
values_to = "value"
) %>%
group_by(index) %>%
summarise(
broom::tidy( t.test(value ~ sex, data = cur_data()))
) %>%
select(index, estimate, statistic, p.value) %>%
arrange(p.value)
来自不同地方的学生在职业探索上有所不同?
以生源地为例。因为生源地有3类,所以可以使用方差分析。
R
aov(career_exploration ~ from, data = d) %>%
TukeyHSD(which = "from") %>%
broom::tidy()
R
library(ggridges)
d %>%
ggplot(aes(x = career_exploration, y = from, fill = from)) +
geom_density_ridges()
也可以一次性的搞定所有指标
R
d %>%
pivot_longer(
cols = -c(pid, sex, majoy, grade, from),
names_to = "index",
values_to = "value"
) %>%
group_by(index) %>%
summarise(
broom::tidy( aov(value ~ from, data = cur_data()))
) %>%
select(index, term, statistic, p.value) %>%
filter(term != "Residuals") %>%
arrange(p.value)
职业探索和决策之间有关联?
可以用相关章节线性模型来探索
R
lm(career_decision_making ~ career_exploration, data = d)
不要因为我讲课讲的很垃圾,就错过了R的美,瑕不掩瑜啦。要相信自己,你们是川师研究生中最聪明的。
R
# remove the objects
# rm(list=ls())
rm(d, data, example, null_dist, obs_diff, standard, t_test_eq, t_test_uneq)
R
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)