19 Model building

library(tidyverse) 
library(modelr) 
options(na.action = na.warn) 
 
library(nycflights13) 
library(lubridate)

19.1 第二个模型的部分代码

# 每天的航班数量
daily <- flights %>% 
  mutate(date = make_date(year, month, day)) %>% 
  group_by(date) %>% 
  summarize(n = n()) 
#> `summarise()` ungrouping output (override with `.groups` argument)
daily 
#> # A tibble: 365 x 2
#>   date           n
#>   <date>     <int>
#> 1 2013-01-01   842
#> 2 2013-01-02   943
#> 3 2013-01-03   914
#> 4 2013-01-04   915
#> 5 2013-01-05   720
#> 6 2013-01-06   832
#> # ... with 359 more rows
ggplot(daily, aes(date, n)) + 
  geom_line()

daily <- daily %>% 
  mutate(wday = wday(date, label = TRUE))  # 添加星期变量
# 绘制星期与航班数量的箱线图
ggplot(daily, aes(wday, n)) + 
  geom_boxplot()

mod <- lm(n ~ wday, data = daily) # 拟合模型

grid <- daily %>% 
  data_grid(wday) %>% 
  add_predictions(mod, "n")   # 添加预测值

ggplot(daily, aes(wday, n)) + 
  geom_boxplot() + 
  geom_point(data = grid, color = "red", size = 4)  # 将预测值放在上图中

daily <- daily %>% 
  add_residuals(mod)  # 添加残差
daily %>% 
  ggplot(aes(date, resid)) + 
  geom_ref_line(h = 0) + 
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) + 
  geom_ref_line(h = 0) + 
  geom_line()

daily %>% 
  filter(wday == "周六") %>% 
  ggplot(aes(date, n)) + 
    geom_point() + 
    geom_line() + 
    scale_x_date( 
      NULL, 
      date_breaks = "1 month", 
      date_labels = "%b" 
    )

term <- function(date) { 
  cut(date, 
    breaks = ymd(20130101, 20130605, 20130825, 20140101), 
    labels = c("spring", "summer", "fall") 
  ) # cut将日期转为因子
} 
daily <- daily %>% 
  mutate(term = term(date)) # 划分日期为季节
daily %>% 
  filter(wday == "周六") %>% 
  ggplot(aes(date, n, color = term)) + 
  geom_point(alpha = 1/3) + 
  geom_line() + 
  scale_x_date( 
    NULL, 
    date_breaks = "1 month", 
    date_labels = "%b" 
  )  # 设置x轴的时间标签

daily %>% 
  ggplot(aes(wday, n, color = term)) + 
    geom_boxplot()

mod1 <- lm(n ~ wday, data = daily) 
mod2 <- lm(n ~ wday * term, data = daily) 
 
daily %>% 
  gather_residuals(without_term = mod1, with_term = mod2) %>% 
  ggplot(aes(date, resid, color = model)) + 
    geom_line(alpha = 0.75)

grid <- daily %>% 
  data_grid(wday, term) %>% 
  add_predictions(mod2, "n") 
 
ggplot(daily, aes(wday, n)) + 
  geom_boxplot() + 
  geom_point(data = grid, color = "red") + 
  facet_wrap(~ term)

# 尝试稳壮的线性回归
mod3 <- MASS::rlm(n ~ wday * term, data = daily) 
 
daily %>% 
  add_residuals(mod3, "resid") %>% 
  ggplot(aes(date, resid)) + 
  geom_hline(yintercept = 0, size = 2, color = "white") + 
  geom_line()