## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(dplyr)
library(magrittr)
library(msSPChelpR)
#Load synthetic dataset of patients with cancer to demonstrate package functions
data("us_second_cancer")

#This dataset is in long format, so each tumor is a separate row in the data
us_second_cancer

## -----------------------------------------------------------------------------
#filter for lung cancer
ids <- us_second_cancer %>%
  #detect ids with any lung cancer
  filter(t_site_icd == "C34") %>%
  select(fake_id) %>%
  as.vector() %>%
  unname() %>%
  unlist()

filtered_usdata <- us_second_cancer %>%
  #filter according to above detected ids with any lung cancer diagnosis
  filter(fake_id %in% ids) %>%
  arrange(fake_id)

filtered_usdata

## -----------------------------------------------------------------------------
renumbered_usdata <- filtered_usdata %>%
  renumber_time_id(new_time_id_var = "t_tumid", 
                   dattype = "seer",
                   case_id_var = "fake_id")

renumbered_usdata %>%
   select(fake_id, sex, t_site_icd, t_datediag, t_tumid)

## -----------------------------------------------------------------------------
usdata_wide <- renumbered_usdata %>%
  reshape_wide_tidyr(case_id_var = "fake_id", time_id_var = "t_tumid", timevar_max = 10)

#now the data is in the wide format as required by many package functions. 
#This means, each case is a row and several tumors per case ID are 
#add new columns to the data using the time_id as column name suffix.
usdata_wide

## -----------------------------------------------------------------------------

usdata_wide <- usdata_wide %>%
  dplyr::mutate(p_spc = dplyr::case_when(is.na(t_site_icd.2)   ~ "No SPC",
                         !is.na(t_site_icd.2)           ~ "SPC developed",
                         TRUE ~ NA_character_)) %>%
  #create the same information as numeric variable count_spc
  dplyr::mutate(count_spc = dplyr::case_when(is.na(t_site_icd.2)   ~ 1,
                            TRUE ~ 0))
usdata_wide %>%
   dplyr::select(fake_id, sex.1, p_spc, count_spc, t_site_icd.1, 
                 t_datediag.1, t_site_icd.2, t_datediag.2)


## -----------------------------------------------------------------------------
usdata_wide <- usdata_wide %>%
  pat_status(., fu_end = "2017-12-31", dattype = "seer",
             status_var = "p_status", life_var = "p_alive.1",
             spc_var = "p_spc", birthdat_var = "datebirth.1",
             lifedat_var = "datedeath.1", fcdat_var = "t_datediag.1",
             spcdat_var = "t_datediag.2", life_stat_alive = "Alive",
             life_stat_dead = "Dead", spc_stat_yes = "SPC developed",
             spc_stat_no = "No SPC", lifedat_fu_end = "2019-12-31",
             use_lifedatmin = FALSE, check = TRUE, 
             as_labelled_factor = TRUE)

usdata_wide %>%
   dplyr::select(fake_id, p_status, p_alive.1, datedeath.1, t_site_icd.1, t_datediag.1, 
                 t_site_icd.2, t_datediag.2)

#alternatively, you can impute the date of death using lifedatmin_var
usdata_wide %>%
  pat_status(., fu_end = "2017-12-31", dattype = "seer",
             status_var = "p_status", life_var = "p_alive.1",
             spc_var = "p_spc", birthdat_var = "datebirth.1",
             lifedat_var = "datedeath.1", fcdat_var = "t_datediag.1",
             spcdat_var = "t_datediag.2", life_stat_alive = "Alive",
             life_stat_dead = "Dead", spc_stat_yes = "SPC developed",
             spc_stat_no = "No SPC", lifedat_fu_end = "2019-12-31",
             use_lifedatmin = TRUE, lifedatmin_var = "p_dodmin.1", 
             check = TRUE, as_labelled_factor = TRUE)



## -----------------------------------------------------------------------------
usdata_wide <- usdata_wide %>%
  dplyr::filter(!p_status %in% c("NA - Patient not born before end of FU",
                                 "NA - Patient did not develop cancer before end of FU",
                                 "NA - Patient date of death is missing"))

usdata_wide %>%
  dplyr::count(p_status)


## -----------------------------------------------------------------------------
usdata_wide <- usdata_wide %>%
   calc_futime(., futime_var_new = "p_futimeyrs", fu_end = "2017-12-31",
               dattype = "seer", time_unit = "years", 
               lifedat_var = "datedeath.1", 
               fcdat_var = "t_datediag.1", spcdat_var = "t_datediag.2")

usdata_wide %>%
   dplyr::select(fake_id, p_status, p_futimeyrs, p_alive.1, datedeath.1, t_datediag.1, t_datediag.2)


## -----------------------------------------------------------------------------
sircalc_results <- usdata_wide %>%
  sir_byfutime(
    dattype = "seer",
    ybreak_vars = c("race.1", "t_dco.1"),
    xbreak_var = "none",
    futime_breaks = c(0, 1/12, 2/12, 1, 5, 10, Inf),
    count_var = "count_spc",
    refrates_df = us_refrates_icd2,
    calc_total_row = TRUE,
    calc_total_fu = TRUE,
    region_var = "registry.1",
    age_var = "fc_agegroup.1",
    sex_var = "sex.1",
    year_var = "t_yeardiag.1",
    race_var = "race.1",
    site_var = "t_site_icd.1", #using grouping by second cancer incidence
    futime_var = "p_futimeyrs",
    alpha = 0.05)

sircalc_results %>% print(n = 100)


## -----------------------------------------------------------------------------
#The summarize function is versatile. Here for example the summary with minimal output

sircalc_results %>%
  #summarize results across region, age, year and t_site
  summarize_sir_results(.,
                        summarize_groups = c("region", "age", "year", "race"),
                        summarize_site = TRUE,
                        output = "long",  output_information = "minimal",
                        add_total_row = "only",  add_total_fu = "no",
                        collapse_ci = FALSE,  shorten_total_cols = TRUE,
                        fubreak_var_name = "fu_time", ybreak_var_name = "yvar_name",
                        xbreak_var_name = "none", site_var_name = "t_site",
                        alpha = 0.05
                        ) %>%
  dplyr::select(-region, -age, -year, -race, -sex, -yvar_name)

## -----------------------------------------------------------------------------
sessionInfo()