69

贝叶斯逻辑回归

logistic 与 binomial 模型

贝叶斯篇
R
library(tidyverse)
library(tidybayes)
library(rstan)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
theme_set(bayesplot::theme_default())

企鹅案例

筛选出物种为"Gentoo"的企鹅,并构建gender变量,male 对应1,female对应0

R
library(palmerpenguins)
gentoo <- penguins %>%
  filter(species == "Gentoo", !is.na(sex)) %>% 
  mutate(gender = if_else(sex == "male", 1, 0))
gentoo

dotplots

借鉴ggdist的Logit dotplots 的画法,画出dotplot

R
gentoo %>%
  ggplot(aes(x = body_mass_g, y = sex, side = ifelse(sex == "male", "bottom", "top"))) +
  geom_dots(scale = 0.5) +
  ggtitle(
    "geom_dots(scale = 0.5)",
    'aes(side = ifelse(sex == "male", "bottom", "top"))'
  )

$$ \begin{align} y_i & = \text{bernoulli}( p_i) \\ p_i & =\text{logit}^{-1}(X_i \beta) \end{align} $$

bayesian logit模型

R
stan_program <- &quot;
data {
  int&lt;lower=0&gt; N;
  vector[N] x;
  int&lt;lower=0,upper=1&gt; y[N];
  int&lt;lower=0&gt; M;
  vector[M] new_x;  
}
parameters {
  real alpha;
  real beta;
}
model {
  // more efficient and arithmetically stable
  y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
  vector[M] y_epred; 
  vector[M] mu = alpha + beta * new_x;

  for(i in 1:M) {
    y_epred[i] = inv_logit(mu[i]);
  }
   
}
&quot;

newdata <- data.frame(
    body_mass_g = seq(min(gentoo$body_mass_g), max(gentoo$body_mass_g), length.out = 100)
   ) 


stan_data <- list(
  N = nrow(gentoo),
  y = gentoo$gender, 
  x = gentoo$body_mass_g,
  M = nrow(newdata),
  new_x = newdata$body_mass_g
)

m <- stan(model_code = stan_program, data = stan_data)
R
fit <- m %>%
  tidybayes::gather_draws(y_epred[i]) %>%
  ggdist::mean_qi(.value)
fit

两个图画在一起

R
fit %>% 
  bind_cols(newdata) %>% 
  ggplot(aes(x = body_mass_g)) +
  geom_dots(
    data = gentoo,
    aes(y = gender, side = ifelse(sex == "male", "bottom", "top")),
    scale = 0.4
  ) +
  geom_lineribbon(
    aes(y = .value, ymin = .lower, ymax = .upper), 
    alpha = 1/4, 
    fill = "#08306b"
  ) +
  labs(
    title = "logit dotplot: stat_dots() with stat_lineribbon()",
    subtitle = 'aes(side = ifelse(sex == "male", "bottom", "top"))',
    x = "Body mass (g) of Gentoo penguins",
    y = "Pr(sex = male)"
  )

篮球案例

我们模拟100个选手每人投篮20次,假定命中概率是身高的线性函数,案例来源chap15.3 of [Regression and Other Stories] (page270).

R
n <- 100

data <-
  tibble(size   = 20,
         height = rnorm(n, mean = 72, sd = 3)) %>% 
  mutate(y = rbinom(n, size = size, p = 0.4 + 0.1 * (height - 72) / 3))

head(data)

常规做法

R
fit_glm <- glm(
  cbind(y, 20-y) ~ height, family = binomial(link = "logit"),
  data = data
)
fit_glm

stan 代码

$$ \begin{align} y_i & = \text{Binomial}(n_i, p_i) \\ p_i & =\text{logit}^{-1}(X_i \beta) \end{align} $$

R
stan_program <- &quot;
data {
  int&lt;lower=0&gt; N;
  int&lt;lower=0&gt; K;
  matrix[N, K] X;
  int&lt;lower=0&gt; y[N];
  int trials[N];
}
parameters {
  vector[K] beta;
}
model {
  
  for(i in 1:N) {
    target += binomial_logit_lpmf(y[i] | trials[i], X[i] * beta);
  }
  
}
&quot;


stan_data <- data %>%
  tidybayes::compose_data(
    N      = n,
    K      = 2,
    y      = y,
    trials = size,
    X      = model.matrix(~ 1 + height)
  )

fit <- stan(model_code = stan_program, data = stan_data)
R
fit
R
# remove the objects
# rm(list=ls())
rm(gentoo, fit, m, stan_data, stan_program, data, fit_glm, fit, n)
R
ggplot2::theme_set(ggplot2::theme_grey())
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)