--- title: "Stepwise Selection for Fine-Gray Competing Risks Models" author: "Ravi Varadhan, Deborah Kuk" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{crrstep-tutorial} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) ``` ## Introduction The crrstep package provides stepwise variable selection for the Fine-Gray proportional subdistribution hazards model for competing risks data. This vignette demonstrates how to use the package with both simple and complex formula specifications. ## Installation ```{r eval=FALSE} install.packages('crrstep') ``` ```{r message=FALSE, warning=FALSE} library(crrstep) library(cmprsk) ``` ## Simulated Data ```{r} set.seed(123) n <- 400 age <- rnorm(n, mean = 60, sd = 10) sex <- factor(sample(c('Male', 'Female'), n, replace = TRUE)) treatment <- factor(sample(c('Control', 'Treatment'), n, replace = TRUE)) biomarker <- rnorm(n, mean = 100, sd = 20) linpred <- 0.03 * (age - 60) + 0.5 * (sex == 'Male') - 0.4 * (treatment == 'Treatment') + 0.01 * biomarker base_hazard <- 0.1 true_time <- rexp(n, rate = base_hazard * exp(linpred)) p_cause1 <- plogis(linpred) cause <- ifelse(runif(n) < p_cause1, 1, 2) censor_time <- rexp(n, rate = 0.05) ftime <- pmin(true_time, censor_time) fstatus <- ifelse(ftime == censor_time, 0, cause) sim_data <- data.frame(ftime, fstatus, age, sex, treatment, biomarker) table(sim_data$fstatus) ``` ## Backward Selection with AIC ```{r} result_back <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_back) ``` ## Forward Selection with AIC ```{r} result_fwd <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'forward', criterion = 'AIC', trace = TRUE ) print(result_fwd) ``` ## Different Selection Criteria ```{r} result_bic <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = FALSE ) cat('AIC selected variables:', nrow(result_back$coefficients), '\\n') cat('BIC selected variables:', nrow(result_bic$coefficients), '\\n') ``` ## Forcing Variables with scope.min ```{r} result_min <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~ age + sex, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_min) ``` ## Factor Variables ```{r} set.seed(456) sim_data$stage <- factor(sample(c('I', 'II', 'III', 'IV'), n, replace = TRUE)) sim_data$region <- factor(sample(c('North', 'South', 'East', 'West'), n, replace = TRUE)) result_factors <- crrstep( formula = ftime ~ age + sex + stage + region, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) summary(result_factors) coef(result_factors) logLik(result_factors) BIC(result_factors) ``` ## Interaction Terms ```{r} result_interact <- crrstep( formula = ftime ~ age + sex + treatment + age:sex + sex:treatment, scope.min = ~ age + sex, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_interact) ``` ## Polynomial Terms ```{r} sim_data$biomarker_c <- scale(sim_data$biomarker, scale = FALSE)[,1] result_poly <- crrstep( formula = ftime ~ age + sex + biomarker_c + I(biomarker_c^2), scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_poly) ``` ## Polynomial-Factor Interactions ```{r} result_complex <- crrstep( formula = ftime ~ age + treatment + biomarker_c + I(biomarker_c^2) + biomarker_c:treatment + I(biomarker_c^2):treatment, scope.min = ~ age, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) print(result_complex) ``` ## Log Transformations ```{r} sim_data$biomarker_pos <- sim_data$biomarker - min(sim_data$biomarker) + 1 result_log <- crrstep( formula = ftime ~ age + sex + log(biomarker_pos) + log(biomarker_pos):sex, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_log) ``` ## Three-Way Interactions ```{r} result_threeway <- crrstep( formula = ftime ~ age + sex + treatment + stage + sex:treatment + sex:stage + treatment:stage + sex:treatment:stage, scope.min = ~ age, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) print(result_threeway) ``` ## Full CRR Object ```{r} fit_full <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = FALSE, crr.object = TRUE ) cat('Coefficients:\\n') print(fit_full$coef) cat('\\nLog-likelihood:\\n') print(fit_full$loglik) ``` ## Missing Data ```{r} sim_data_miss <- sim_data sim_data_miss$age[sample(1:n, 20)] <- NA sim_data_miss$biomarker[sample(1:n, 15)] <- NA result_miss <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data_miss, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_miss) ``` ## Using Subset ```{r} result_subset <- crrstep( formula = ftime ~ age + treatment + biomarker, scope.min = ~1, etype = fstatus, subset = sim_data$sex == 'Female', data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = FALSE ) print(result_subset) ``` ## Best Practices 1. Center continuous variables when using polynomial terms 2. Include main effects in scope.min when testing interactions 3. Use BIC or BICcr for more parsimonious models 4. Check for multicollinearity before modeling ## Troubleshooting Convergence issues: Center/scale variables, remove correlated variables Singular matrix errors: Remove redundant variables, combine sparse factor levels ## References Kuk, D. and Varadhan, R. (2013). Model selection in competing risks regression. Statistics in Medicine. Fine, J.P. and Gray, R.J. (1999). A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446), 496-509. ## Session Info ```{r} sessionInfo() ```