## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(dpi = 300, fig.width = 6.5, fig.height = 3, out.width = "650px")

## ---- include = FALSE---------------------------------------------------------
t0 <- Sys.time()

## ---- message = FALSE, warning = FALSE----------------------------------------
set.seed(08544)
library(gapclosing)
library(dplyr)
library(ggplot2)

## -----------------------------------------------------------------------------
simulated_data <- generate_simulated_data(n = 1000)
head(simulated_data)

## -----------------------------------------------------------------------------
estimate <- gapclosing(
  data = simulated_data,
  counterfactual_assignments = 1,
  outcome_formula = formula(outcome ~ confounder + category*treatment),
  treatment_formula = formula(treatment ~ confounder + category),
  category_name = "category",
  se = TRUE,
  # You can process the bootstrap in parallel with as many cores as available
  parallel_cores = 2
)

## ---- include = FALSE---------------------------------------------------------
plots <- plot(estimate, return_plots = TRUE)

## ---- echo = FALSE, fig.width = 5, fig.height = 3.5, out.width = "650px", fig.cap = "Figure 1 produced by plot() function"----
print(plots[[1]] +
        ggtitle("First result of a call to plot()"))

## ---- echo = FALSE, fig.width = 5, fig.height = 3.5, out.width = "650px", fig.cap = "Figure 1 produced by plot() function"----
disparityplot(estimate, category_A = "A", category_B = "B") +
  ggtitle("A disparityplot()")

## ---- include = FALSE-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
old <- options()
options(width = 300)

## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
summary(estimate)

## ---- include = FALSE---------------------------------------------------------
options(old)

## ---- echo = FALSE, fig.height = 2--------------------------------------------
for_DAG <- data.frame(label = c("Category","Confounder","Treatment","Outcome","Unobserved"),
                      x = c(1,2,3,4,1),
                      y = c(1,1,1,1,0))
for_DAG %>%
  ggplot(aes(x = x, y = y, label = label)) +
  # Edges coming out of category
  annotate(geom = "segment",
         x = 1, y = 1, xend = 1.55, yend = 1,
         arrow = arrow(length = unit(.1,"in"))) +
  annotate(geom = "curve",
           x = 1, y = 1, xend = c(2.9,3.9), yend = c(1.2,1.2),
           arrow = arrow(length = unit(.1,"in")),
           curvature = -.3) +
  # Edges coming out of confounder
  annotate(geom = "segment",
           x = 2, y = 1, xend = 2.65, yend = 1,
           arrow = arrow(length = unit(.1,"in"))) +
  annotate(geom = "curve",
           x = 2, y = 1, xend = 3.75, yend = .82,
           arrow = arrow(length = unit(.1,"in")),
           curvature = .25) +
  # Edges coming out of treatment
  annotate(geom = "segment",
           x = 3, y = 1, xend = 3.65, yend = 1,
           color = "blue", size = 1.05,
           arrow = arrow(length = unit(.13,"in"))) +
  # Edges coming out of unobserved
  annotate(geom = "segment",
           x = 1, y = 0, xend = c(1,1.9), yend = .8,
           arrow = arrow(length = unit(.1,"in"))) +
  annotate(geom = "curve",
           x = 1, y = 0, xend = c(3.9), yend = .8,
           arrow = arrow(length = unit(.1,"in")),
           curvature = .25) +
  geom_label(color = "white") +
  geom_label(data = for_DAG %>% filter(label %in% c("Category","Confounder"))) +
  geom_text(data = for_DAG %>% filter(label %in% c("Unobserved","Treatment","Outcome"))) +
  ylim(c(-.3,2)) +
  xlim(c(0,5)) +
  theme_void()

## ---- warning = FALSE---------------------------------------------------------
estimate_gam <- gapclosing(
  data = simulated_data,
  counterfactual_assignments = 1,
  outcome_formula = formula(outcome ~ s(confounder) + category*treatment),
  treatment_formula = formula(treatment ~ s(confounder) + category),
  category_name = "category",
  treatment_algorithm = "gam",
  outcome_algorithm = "gam",
  sample_split = "cross_fit"
  # Note: Standard errors with `se = TRUE` are supported.
  # They are omitted here only to speed vignette build time.
)

## -----------------------------------------------------------------------------
estimate_ranger <- gapclosing(
  data = simulated_data,
  counterfactual_assignments = 1,
  outcome_formula = formula(outcome ~ confounder + category),
  treatment_formula = formula(treatment ~ confounder + category),
  category_name = "category",
  treatment_algorithm = "ranger",
  outcome_algorithm = "ranger",
  sample_split = "cross_fit"
  # Note: Standard errors with `se = TRUE` are supported.
  # They are omitted here only to speed vignette build time.
)

## ---- include = FALSE---------------------------------------------------------
glm_plot <- plot(estimate, return_plots = T)[[1]]
gam_plot <- plot(estimate_gam, return_plots = T)[[1]]
ranger_plot <- plot(estimate_ranger, return_plots = T)[[1]]

## ---- echo = FALSE, figures-side, fig.show="hold", out.width="30%", message = FALSE, warning = FALSE----
glm_plot +
    ggtitle("GLM estimate") +
    ylim(c(-1.5,1.5)) +
  theme(legend.position = "bottom")
gam_plot +
    ggtitle("GAM estimate") +
    ylim(c(-1.5,1.5)) +
  theme(legend.position = "bottom")
ranger_plot +
  ggtitle("Random forest estimate") +
  ylim(c(-1.5,1.5)) +
  theme(legend.position = "bottom")

## -----------------------------------------------------------------------------
estimate_stochastic <- gapclosing(
  data = simulated_data,
  counterfactual_assignments = .75,
  outcome_formula = formula(outcome ~ confounder + category*treatment),
  treatment_formula = formula(treatment ~ confounder + category),
  category_name = "category"
)

## ---- echo = FALSE, fig.caption = "A conceptual figure for which the package does not provide a general-purpose function."----
counterfactual_assignments_values <- seq(0,1,.25)
many_stochastic_estimates <- rep(NA, length(counterfactual_assignments_values))
for (i in 1:length(counterfactual_assignments_values)) {
  estimate_case <- gapclosing(
    data = simulated_data,
    counterfactual_assignments = counterfactual_assignments_values[i],
    outcome_formula = formula(outcome ~ confounder + category*treatment),
    treatment_formula = formula(treatment ~ confounder + category),
    category_name = "category"
  )
  estimate_we_want <- estimate_case$change_disparities %>%
    filter(change_type == "proportional") %>%
    filter(category == "B - A")
  many_stochastic_estimates[i] <- estimate_we_want$estimate
}
data.frame(counterfactual_assignments = counterfactual_assignments_values,
           estimate = many_stochastic_estimates) %>%
  ggplot(aes(x = counterfactual_assignments,
             y = estimate)) +
  geom_hline(yintercept = c(0,1),
             color = "gray", linetype = "dashed") +
  geom_line(color = "gray") +
  geom_point(aes(color = factor(counterfactual_assignments == .75))) +
  xlab("Counterfactual Treatment Assignment Probability") +
  scale_y_continuous(name = "Percent of B - A Gap Closed",
                     labels = function(x) paste0(round(100*x),"%")) +
  annotate(geom = "text",
           x = 1, y = c(0,1),
           label = c("No change to disparity",
                     "Disparity completely eliminated"),
           color = "darkgray", size = 3, hjust = 1, vjust = 1.5) +
  annotate(geom = "text",
           x = .1, y = .3, 
           label = "Disparities eliminated under various\nstochastic\ninterventions",
           color = "gray", size = 3, hjust = 0, vjust = 1) +
  annotate(geom = "text",
           x = .75, y = .3,
           label = "Gap-Closing Estimand\nCalculated Above",
           color = "blue", size = 3, vjust = -.5) +
  scale_color_manual(values = c("gray","blue")) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = "none")

## -----------------------------------------------------------------------------
our_assignments <- case_when(simulated_data$category == "A" ~ .5,
                             simulated_data$category == "B" ~ .4, 
                             simulated_data$category == "C" ~ .3)
estimate_stochastic <- gapclosing(
  data = simulated_data,
  counterfactual_assignments = our_assignments,
  outcome_formula = formula(outcome ~ confounder + category*treatment),
  treatment_formula = formula(treatment ~ confounder + category),
  category_name = "category"
)