诺贝尔奖分析
探索诺贝尔奖数据
应用篇探索性数据分析(exporatory data analysis)是各种知识的综合运用。本章通过一个案例,讲解探索性数据分析的基本思路,也算是对前面几章内容的一次总结复习。
探索性
- 数据准备(对数据要做到心中有数)
- 描述变量
- 数据结构
- 缺失值及其处理
- 数据探索(围绕探索的目标)
- 数据规整
- 可视化
- 建模
数据集
这是一个诺贝尔奖获得者的数据集,
导入数据
R
library(tidyverse)
library(lubridate)
R
df <- read_csv("./demo_data/nobel_winners.csv")
df
如果是xlsx格式
R
readxl::read_excel("myfile.xlsx")
如果是csv格式
R
readr::read_csv("myfile.csv")
::: {.rmdnote} 这里有个小小的提示:
- 路径(包括文件名), 不要用中文和空格
- 数据框中变量,也不要有中文和空格(可用下划线代替空格)
:::
数据结构
一行就是一个诺奖获得者的记录? 确定?
缺失值及其处理
R
df %>% map_df(~ sum(is.na(.)))
性别缺失怎么造成的?
R
df %>% count(laureate_type)
我们想探索哪些问题?
你想关心哪些问题,可能是
- 每个学科颁过多少次奖?
- 这些大神都是哪个年代的人?
- 性别比例
- 平均年龄和获奖数量
- 最年轻的诺奖获得者是谁?
- 中国诺奖获得者有哪些?
- 得奖的时候多大年龄?
- 获奖者所在国家的经济情况?
- 有大神多次获得诺贝尔奖,而且在不同科学领域获奖?
- 出生地分布?工作地分布?迁移模式?
- GDP经济与诺奖模型?
- 诺奖分享情况?
每个学科颁过多少次奖
R
df %>% count(category)
R
df %>%
count(category) %>%
ggplot(aes(x = category, y = n, fill = category)) +
geom_col() +
geom_text(aes(label = n), vjust = -0.25) +
theme(legend.position = "none")
R
df %>%
count(category) %>%
ggplot(aes(x = fct_reorder(category, n), y = n, fill = category)) +
geom_col() +
geom_text(aes(label = n), vjust = -0.25) +
labs(title = "Number of Nobel prizes in different disciplines") +
theme(legend.position = "none")
也可以使用别人定义好的配色方案
R
library(ggthemr) # install.packages("devtools")
# devtools::install_github('cttobin/ggthemr')
ggthemr("dust")
df %>%
count(category) %>%
ggplot(aes(x = fct_reorder(category, n), y = n, fill = category)) +
geom_col() +
labs(title = "Number of Nobel prizes in different disciplines") +
theme(legend.position = "none")
这个配色方案感觉挺好看的呢,比较适合我这种又挑剔又懒惰的人。
当然,也可以自己DIY,或者使用配色网站的主题方案(https://learnui.design/tools/data-color-picker.html#palette)
R
df %>%
count(category) %>%
ggplot(aes(x = fct_reorder(category, n), y = n)) +
geom_col(fill = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +
labs(title = "Number of Nobel prizes in different disciplines") +
theme(legend.position = "none")
让图骚动起来吧
R
library(gganimate) # install.packages("gganimate", dependencies = T)
df %>%
count(category) %>%
mutate(category = fct_reorder(category, n)) %>%
ggplot(aes(x = category, y = n)) +
geom_text(aes(label = n), vjust = -0.25) +
geom_col(fill = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +
labs(title = "Number of Nobel prizes in different disciplines") +
theme(legend.position = "none") +
transition_states(category) +
shadow_mark(past = TRUE)
和ggplot2的分面一样,动态图可以增加数据展示的维度。
看看我们伟大的祖国
R
df %>%
dplyr::filter(birth_country == "China") %>%
dplyr::select(full_name, prize_year, category)
我们发现获奖者有多个地址,就会有重复的情况,比如 Charles Kuen Kao在2009年Physics有两次,为什么重复计数了呢?
下面我们去重吧, 去重可以用distinct()函数
R
dt <- tibble::tribble(
~x, ~y, ~z,
1, 1, "a",
1, 1, "b",
1, 2, "c",
1, 2, "d"
)
dt
R
dt %>% distinct_at(vars(x), .keep_all = T)
R
dt %>% distinct_at(vars(x, y), .keep_all = T)
R
nobel_winners <- df %>%
mutate_if(is.character, tolower) %>%
distinct_at(vars(full_name, prize_year, category), .keep_all = TRUE) %>%
mutate(
decade = 10 * (prize_year %/% 10),
prize_age = prize_year - year(birth_date)
)
nobel_winners
{BLOCK EDA-NOBEL-20, TYPE="DANGER"}
这是时候,我们才对数据有了一个初步的了解
再来看看我的祖国
R
nobel_winners %>%
dplyr::filter(birth_country == "china") %>%
dplyr::select(full_name, prize_year, category)
哪些大神多次获得诺贝尔奖
R
nobel_winners %>% count(full_name, sort = T)
R
nobel_winners %>%
group_by(full_name) %>%
mutate(
number_prize = n(),
number_cateory = n_distinct(category)
) %>%
arrange(desc(number_prize), full_name) %>%
dplyr::filter(number_cateory == 2)
大神在得奖的时候是多大年龄?
R
nobel_winners %>%
count(prize_age) %>%
ggplot(aes(x = prize_age, y = n)) +
geom_col()
R
nobel_winners %>%
group_by(category) %>%
summarise(mean_prize_age = mean(prize_age, na.rm = T))
R
nobel_winners %>%
mutate(category = fct_reorder(category, prize_age, median, na.rm = TRUE)) %>%
ggplot(aes(category, prize_age)) +
geom_point() +
geom_boxplot() +
coord_flip()
R
nobel_winners %>%
dplyr::filter(!is.na(prize_age)) %>%
group_by(decade, category) %>%
summarize(
average_age = mean(prize_age),
median_age = median(prize_age)
) %>%
ggplot(aes(decade, average_age, color = category)) +
geom_line()
R
library(ggridges)
nobel_winners %>%
ggplot(aes(
x = prize_age,
y = category,
fill = category
)) +
geom_density_ridges()
他们60多少岁才得诺奖,大家才23或24岁,还年轻,不用焦虑喔。
R
nobel_winners %>%
ggplot(aes(x = prize_age, fill = category, color = category)) +
geom_density() +
facet_wrap(vars(category)) +
theme(legend.position = "none")
有同学说要一个个的画,至于group_split()函数,下次课在讲
R
nobel_winners %>%
group_split(category) %>%
map(
~ ggplot(data = .x, aes(x = prize_age)) +
geom_density() +
ggtitle(.x$category)
)
也可以用强大的group_by() + group_map()组合,我们会在相关章节讲到
R
nobel_winners %>%
group_by(category) %>%
group_map(
~ ggplot(data = .x, aes(x = prize_age)) +
geom_density() +
ggtitle(.y)
)
性别比例
R
nobel_winners %>%
dplyr::filter(laureate_type == "individual") %>%
count(category, gender) %>%
group_by(category) %>%
mutate(prop = n / sum(n))
各年代性别比例
R
nobel_winners %>%
dplyr::filter(laureate_type == "individual") %>%
# mutate(decade = glue::glue("{round(prize_year - 1, -1)}s")) %>%
count(decade, category, gender) %>%
group_by(decade, category) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(decade, category, fill = prop)) +
geom_tile(size = 0.7) +
# geom_text(aes(label = scales::percent(prop, accuracy = .01))) +
geom_text(aes(label = scales::number(prop, accuracy = .01))) +
facet_grid(vars(gender)) +
scale_fill_gradient(low = "#FDF4E9", high = "#834C0D")
R
library(ggbeeswarm) # install.packages("ggbeeswarm")
nobel_winners %>%
ggplot(aes(
x = category,
y = prize_age,
colour = gender,
alpha = gender
)) +
ggbeeswarm::geom_beeswarm() +
coord_flip() +
scale_color_manual(values = c("#BB1288", "#5867A6")) +
scale_alpha_manual(values = c(1, .4)) +
theme_minimal() +
theme(legend.position = "top") +
labs(
title = "Gender imbalance of Nobel laureates",
subtitle = "data frome 1901-2016",
colour = "Gender",
alpha = "Gender",
y = "age in prize"
)
R
nobel_winners %>%
count(decade,
category,
gender = coalesce(gender, laureate_type)
) %>%
group_by(decade, category) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(decade, n, fill = gender)) +
geom_col() +
facet_wrap(~category) +
labs(
x = "Decade",
y = "# of nobel prize winners",
fill = "Gender",
title = "Nobel Prize gender distribution over time"
)
这些大神都是哪个年代出生的人?
R
nobel_winners %>%
select(category, birth_date) %>%
mutate(year = floor(year(birth_date) / 10) * 10) %>%
count(category, year) %>%
dplyr::filter(!is.na(year)) %>%
ggplot(aes(x = year, y = n)) +
geom_col() +
scale_x_continuous(breaks = seq(1810, 1990, 20)) +
geom_text(aes(label = n), vjust = -0.25) +
facet_wrap(vars(category))
课堂练习,哪位同学能把图弄得好看些?
最年轻的诺奖获得者?
R
nobel_winners %>%
dplyr::filter(prize_age == min(prize_age, na.rm = T))
R
nobel_winners %>%
dplyr::filter(
rank(prize_year - year(birth_date)) == 1
)
R
nobel_winners %>%
arrange(
prize_year - year(birth_date)
)
R
nobel_winners %>%
top_n(1, year(birth_date) - prize_year)
平均年龄和获奖数量
R
df1 <- nobel_winners %>%
group_by(category) %>%
summarise(
mean_prise_age = mean(prize_age, na.rm = T),
total_num = n()
)
df1
R
df1 %>%
ggplot(aes(mean_prise_age, total_num)) +
geom_point(aes(color = category)) +
geom_smooth(method = lm, se = FALSE)
出生地与工作地分布
R
nobel_winners_clean <- nobel_winners %>%
mutate_at(
vars(birth_country, death_country),
~ ifelse(str_detect(., "\\("), str_extract(., "(?<=\\().*?(?=\\))"), .)
) %>%
mutate_at(
vars(birth_country, death_country),
~ case_when(
. == "scotland" ~ "united kingdom",
. == "northern ireland" ~ "united kingdom",
str_detect(., "czech") ~ "czechia",
str_detect(., "germany") ~ "germany",
TRUE ~ .
)
) %>%
select(full_name, prize_year, category, birth_date, birth_country, gender, organization_name, organization_country, death_country)
R
nobel_winners_clean %>% count(death_country, sort = TRUE)
迁移模式
R
nobel_winners_clean %>%
mutate(
colour = case_when(
death_country == "united states of america" ~ "#FF2B4F",
death_country == "germany" ~ "#fcab27",
death_country == "united kingdom" ~ "#3686d3",
death_country == "france" ~ "#88398a",
death_country == "switzerland" ~ "#20d4bc",
TRUE ~ "gray60"
)
) %>%
ggplot(aes(
x = 0,
y = fct_rev(factor(birth_country)),
xend = death_country,
yend = 1,
colour = colour,
alpha = (colour != "gray60")
)) +
geom_curve(
curvature = -0.5,
arrow = arrow(length = unit(0.01, "npc"))
) +
scale_x_discrete() +
scale_y_discrete() +
scale_color_identity() +
scale_alpha_manual(values = c(0.1, 0.2), guide = F) +
scale_size_manual(values = c(0.1, 0.4), guide = F) +
theme_minimal() +
theme(
panel.grid = element_blank(),
plot.background = element_rect(fill = "#F0EFF1", colour = "#F0EFF1"),
legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)
)
地图
R
library(here)
library(sf)
library(countrycode)
# countrycode('Albania', 'country.name', 'iso3c')
nobel_winners_birth_country <- nobel_winners_clean %>%
count(birth_country) %>%
filter(!is.na(birth_country)) %>%
mutate(ISO3 = countrycode(birth_country,
origin = "country.name", destination = "iso3c"
))
global <-
sf::st_read("./demo_data/worldmap/TM_WORLD_BORDERS_SIMPL-0.3.shp") %>%
st_transform(4326)
global %>%
full_join(nobel_winners_birth_country, by = "ISO3") %>%
ggplot() +
geom_sf(aes(fill = n),
color = "white",
size = 0.1
) +
labs(
x = NULL, y = NULL,
title = "Nobel Winners by country",
subtitle = "color of map indicates number of Nobel lauretes",
fill = "num of Nobel lauretes",
caption = "Made: wang_minjie"
) +
scale_fill_gradientn(colors = c("royalblue1", "magenta", "orange", "gold"), na.value = "white") +
# scale_fill_gradient(low = "wheat1", high = "red") +
theme_void() +
theme(
legend.position = c(0.1, 0.3),
plot.background = element_rect(fill = "gray")
)
R
# Determine to 10 Countries
topCountries <- nobel_winners_clean %>%
count(birth_country, sort = TRUE) %>%
na.omit() %>%
top_n(8)
topCountries
R
df4 <- nobel_winners_clean %>%
filter(birth_country %in% topCountries$birth_country) %>%
group_by(birth_country, category, prize_year) %>%
summarise(prizes = n()) %>%
mutate(cumPrizes = cumsum(prizes))
df4
R
library(gganimate)
df4 %>%
mutate(prize_year = as.integer(prize_year)) %>%
ggplot(aes(x = birth_country, y = category, color = birth_country)) +
geom_point(aes(size = cumPrizes), alpha = 0.6) +
# geom_text(aes(label = cumPrizes)) +
scale_size_continuous(range = c(2, 30)) +
transition_reveal(prize_year) +
labs(
title = "Top 10 countries with Nobel Prize winners",
subtitle = "Year: {frame_along}",
y = "Category"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 22),
axis.title = element_blank()
) +
scale_color_brewer(palette = "RdYlBu") +
theme(legend.position = "none") +
theme(plot.margin = margin(5.5, 5.5, 5.5, 5.5))
出生地和工作地不一样的占比
R
nobel_winners_clean %>%
select(category, birth_country, death_country) %>%
mutate(immigration = if_else(birth_country == death_country, 0, 1))
诺奖分享者
R
nobel_winners %>%
separate(prize_share, into = c("num", "deno"), sep = "/", remove = FALSE)
R
nobel_winners %>%
filter(category == "medicine") %>%
mutate(
num_a = as.numeric(str_sub(prize_share, 1, 1)),
num_b = as.numeric(str_sub(prize_share, -1)),
share = num_a / num_b,
year = prize_year %% 10,
decade = 10 * (prize_year %/% 10)
) %>%
group_by(prize_year) %>%
mutate(n = row_number()) %>%
ggplot() +
geom_col(aes(x = "", y = share, fill = as.factor(n)),
show.legend = FALSE
) +
coord_polar("y") +
facet_grid(decade ~ year, switch = "both") +
labs(title = "Annual Nobel Prize sharing") +
theme_void() +
theme(
plot.title = element_text(face = "bold", vjust = 8),
strip.text.x = element_text(
size = 7,
margin = margin(t = 5)
),
strip.text.y = element_text(
size = 7,
angle = 180, hjust = 1, margin = margin(r = 10)
)
)
其它
没有回答的问题,大家自己花时间探索下。
延伸阅读
- 有些图可以再美化下
R
# remove the objects
rm(df, df1, df4, dt, global, nobel_winners, nobel_winners_birth_country, nobel_winners_clean, scale_color_continuous, scale_color_discrete, scale_color_gradient, topCountries)
R
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)