## ----warning=FALSE, results='hide',message=FALSE------------------------------
## library packages
library(gompertztrunc)         ## calculate mortality differentials under double-truncation  
library(tidyverse)             ## data manipulation and visualization  
library(data.table)            ## fast data manipulation
library(cowplot)               ## publication-ready themes for ggplot
library(socviz)                ## helper functions for data visualization (Kieran Healy)  
library(broom)                 ## "tidy" model output 


## load data 
sim_data <- sim_data             ## simulated 
bunmd_demo <- bunmd_demo         ## real 
numident_demo <- gompertztrunc::numident_demo   ## real 

## -----------------------------------------------------------------------------
## Look at simulated data 
head(sim_data)

## What years do we have mortality coverage? 
sim_data %>% 
  summarize(min(dyear), max(dyear)) 

## -----------------------------------------------------------------------------
## run gompertz_mle function  
## returns a list 
simulated_example <- gompertz_mle(formula = aod ~ temp + as.factor(sex) + as.factor(isSouth),
                                                 left_trunc = 1888,
                                                 right_trunc = 1905,
                                                 data = sim_data)

## -----------------------------------------------------------------------------
## 1. starting value for coefficients (from linear regression)
simulated_example$starting_values

## 2. optim fit object
simulated_example$optim_fit

## 2. check model convergence (0 == convergence)
simulated_example$optim_fit$convergence

## 3. Look at model results 
simulated_example$results

## -----------------------------------------------------------------------------
## true coefficient values (we know because we simulated them)
mycoefs <- c("temp" = +.2, "sex" = -.5, "isSouth" = +.6)

## compare
simulated_example$results %>%
  filter(!stringr::str_detect(parameter, "gompertz")) %>%
  mutate(true_coef = mycoefs) %>%
  select(parameter, coef, coef_lower, coef_upper, true_coef)

## -----------------------------------------------------------------------------
## translate hazard rates to difference in e65
convert_hazards_to_ex(simulated_example$results, age = 65, use_model_estimates = T) %>% 
  select(parameter, hr, hr_lower, hr_upper, e65, e65_lower, e65_upper)

## -----------------------------------------------------------------------------
## look at data 
head(bunmd_demo)

## how many people per country? 
bunmd_demo %>%
  count(bpl_string)

## ----fig.width = 6, fig.height = 4--------------------------------------------
## distribution of deaths?
ggplot(data = bunmd_demo) + 
  geom_histogram(aes(x = death_age),
                 fill = "grey",
                 color = "black",
                 binwidth = 1) + 
  cowplot::theme_cowplot() + 
  labs(x = "Age of Death",
       y = "N") + 
  facet_wrap(~bpl_string)

## -----------------------------------------------------------------------------
## run linear model 
lm_bpl <- lm(death_age ~ bpl_string + as.factor(byear), data = bunmd_demo)

## extract coefficients from model 
lm_bpl_tidy <- tidy(lm_bpl) %>%
  filter(str_detect(term, "bpl_string")) %>%
  mutate(term = prefix_strip(term, "bpl_string"))

## rename variables 
lm_bpl_tidy <- lm_bpl_tidy %>%
  mutate(
    e65 = estimate,
    e65_lower = estimate - 1.96 * std.error,
    e65_upper = estimate + 1.96 * std.error
  ) %>%
  rename(country = term) %>%
  mutate(method = "Regression on Age of Death")

## -----------------------------------------------------------------------------
## run gompertztrunc
## set truncation bounds to 1988-2005 because we are using BUNMD 
gompertz_mle_results <- gompertz_mle(formula = death_age ~ bpl_string, 
                                    left_trunc = 1988,
                                    right_trunc = 2005,
                                    data = bunmd_demo)

## convert to e65
## use model estimates — but can also set other defaults for Gompertz M and b. 
mle_results <- convert_hazards_to_ex(gompertz_mle_results$results, use_model_estimates = T)

## tidy up results 
mle_results <- mle_results %>% 
  rename(country = parameter) %>%
  filter(str_detect(country, "bpl_string")) %>%
  mutate(country = prefix_strip(country, "bpl_string")) %>%
  mutate(method = "Gompertz Parametric Estimate")

## look at results 
mle_results

## ----fig.width = 7.2, fig.height = 5------------------------------------------
## combine results from both models 
bpl_results <- lm_bpl_tidy %>%
  bind_rows(mle_results)

## calculate adjustment factor (i.e., how much bigger are Gompertz MLE results)
adjustment_factor <- bpl_results %>% 
  select(country, method, e65) %>%
  pivot_wider(names_from = method, values_from = e65) %>%
  mutate(adjustment_factor = `Gompertz Parametric Estimate` / `Regression on Age of Death`) %>%
  summarize(adjustment_factor_mean = round(mean(adjustment_factor), 3)) %>%
  as.vector()

## plot results
bpl_results %>%
  bind_rows(mle_results) %>%
  ggplot(aes(y = reorder(country, e65), x = e65, xmin = e65_lower, xmax = e65_upper, color = method)) +
  geom_pointrange(position = position_dodge(width = 0.2), shape = 1) +
  cowplot::theme_cowplot(font_size = 12) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(
    x = "Estimate",
    title = "Foreign-Born Male Ages at Death, BUNMD 1905-1914",
    y = "",
    subtitle = paste0("Gompertz MLE estimates are ~", adjustment_factor, " times larger than regression on age of death")
  ) +
  scale_color_brewer(palette = "Set1") +
  annotate("text", label = "Native Born Whites", x = 0.1, y = 3, angle = 90, size = 3, color = "black")

## -----------------------------------------------------------------------------
## create the dataset
bunmd_1915_cohort <- bunmd_demo %>% 
  filter(byear == 1915, death_age >= 65) %>% 
  filter(bpl_string %in% c("Native Born White", "Mexico"))

## run gompertz_mle()
bpl_results_1915_cohort <- gompertz_mle(formula = death_age ~ bpl_string, 
                                   data = bunmd_1915_cohort,
                                   left_trunc = 1988, 
                                   right_trunc = 2005)

## look at results
bpl_results_1915_cohort$results

## ----fig.width = 7, fig.height = 3.5------------------------------------------
diagnostic_plot(object = bpl_results_1915_cohort, data = bunmd_1915_cohort,
                covar = "bpl_string", death_var = "death_age")

## ----fig.width = 7, fig.height = 4--------------------------------------------
diagnostic_plot_hazard(object = bpl_results_1915_cohort, data = bunmd_1915_cohort,
                covar = "bpl_string", death_var = "death_age", xlim=c(65,95))

## -----------------------------------------------------------------------------
## load in file 
numident_demo <- numident_demo

## recode categorical education variable to continuous "years of education" 
numident_demo <- numident_demo %>% 
  mutate(educ_yrs = case_when(
    educd == "No schooling completed" ~ 0,
    educd == "Grade 1" ~ 1,
    educd == "Grade 2" ~ 2,
    educd == "Grade 3" ~ 3,
    educd == "Grade 4" ~ 4,
    educd == "Grade 5" ~ 5,
    educd == "Grade 6" ~ 6,
    educd == "Grade 7" ~ 7,
    educd == "Grade 8" ~ 8,
    educd == "Grade 9" ~ 9,
    educd == "Grade 10" ~ 10,
    educd == "Grade 11" ~ 11,
    educd == "Grade 12" ~ 12,
    educd == "Grade 12" ~ 12,
    educd == "1 year of college" ~ 13,
    educd == "2 years of college" ~ 14,
    educd == "3 years of college" ~ 15,
    educd == "4 years of college" ~ 16,
    educd == "5+ years of college" ~ 17
  ))

## restrict to men 
data_numident_men <- numident_demo %>% 
  filter(sex == "Male") %>% 
  filter(byear %in% 1910:1920 & death_age > 65)


## -----------------------------------------------------------------------------
## look at person-level weights 
head(data_numident_men$weight)

## run gompertz model with person weights
education_gradient <- gompertz_mle(formula = death_age ~ educ_yrs, 
                                   data = data_numident_men,
                                   weights = weight, ## specify person-level weights 
                                   left_trunc = 1988, 
                                   right_trunc = 2005)

## look at results 
education_gradient$results 

## translate to e65
mle_results_educ <- convert_hazards_to_ex(education_gradient$results, use_model_estimates = T, age = 65) %>% 
  mutate(method = "Parametric Gompertz MLE")

## look at results
mle_results_educ

## ----fig.width = 7.2, fig.height = 5------------------------------------------
## run linear model 
lm_bpl <- lm(death_age ~ educ_yrs + as.factor(byear), data = data_numident_men, weights = weight)

## extract coefficients from model 
lm_bpl_tidy <- tidy(lm_bpl) %>%
  filter(str_detect(term, "educ_yrs"))

## rename variables 
ols_results <- lm_bpl_tidy %>%
  mutate(
    e65 = estimate,
    e65_lower = estimate - 1.96 * std.error,
    e65_upper = estimate + 1.96 * std.error
  ) %>%
  rename(parameter = term) %>%
  mutate(method = "Regression on Age of Death")

## Plot results
education_plot <- ols_results %>%
  bind_rows(mle_results_educ) %>%
  mutate(parameter = "Education (Years) Regression Coefficient") %>% 
  ggplot(aes(x = method, y = e65, ymin = e65_lower, ymax = e65_upper)) +
  geom_pointrange(position = position_dodge(width = 0.2), shape = 1) +
  cowplot::theme_cowplot(font_size = 12) +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(
    x = "",
    title = "Association Education (Years) and Longevity",
    subtitle = "Men, CenSoc-Numident 1910-1920",
    y = ""
  ) +
  scale_color_brewer(palette = "Set1") + 
  ylim(0, 0.5) 

education_plot