## ---- include = FALSE---------------------------------------------------------
# Global knitr chunk options for the rendered document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library("flevr")

## ----load-biomarker-data------------------------------------------------------
# load the dataset
data("biomarkers")
library("dplyr")
# set up vector "y" of outcomes and matrix "x" of features,
# restricted to rows with no missing values
cc <- complete.cases(biomarkers)
y_cc <- biomarkers$mucinous[cc]
x_cc <- biomarkers %>%
  na.omit() %>%
  select(starts_with("lab"), starts_with("cea"))
x_df <- as.data.frame(x_cc)
x_names <- names(x_df)

## ----fit-sl-------------------------------------------------------------------
set.seed(1234)
# fit a Super Learner ensemble; note its simplicity, for speed
library("SuperLearner")
learners <- c("SL.glm", "SL.ranger.imp", "SL.glmnet")
V <- 2
fit <- SuperLearner::SuperLearner(
  Y = y_cc, X = x_df,
  SL.library = learners,
  cvControl = list(V = V),
  family = "binomial"
)
# check the SL weights
fit$coef
# extract importance based on the whole Super Learner
sl_importance_all <- extract_importance_SL(
  fit = fit, feature_names = x_names, import_type = "all"
)
sl_importance_all

## ----sl-best-alg--------------------------------------------------------------
# same extraction, but with import_type = "best" instead of "all"
sl_importance_best <- extract_importance_SL(
  fit = fit, feature_names = x_names, import_type = "best"
)
sl_importance_best

## ----extrinsic-selection------------------------------------------------------
extrinsic_selected <- extrinsic_selection(
  fit = fit, feature_names = x_names, threshold = 5, import_type = "all"
)
extrinsic_selected

## ----impute-setup-------------------------------------------------------------
# number of imputed datasets used by the (non-evaluated) chunks below
n_imp <- 2

## ----impute, eval = FALSE-----------------------------------------------------
# library("mice")
# set.seed(20231121)
# mi_biomarkers <- mice::mice(data = biomarkers, m = n_imp, printFlag = FALSE)
# imputed_biomarkers <- mice::complete(mi_biomarkers, action = "long") %>%
#   rename(imp = .imp, id = .id)

## ----extrinsic-selection-with-missing-data, eval = FALSE----------------------
# set.seed(20231121)
# # set up a list to collect selected sets, one slot per imputed dataset
# all_selected_vars <- vector("list", length = n_imp)
# # for each imputed dataset, do extrinsic selection
# for (i in seq_len(n_imp)) {
#   # fit a Super Learner
#   these_data <- imputed_biomarkers %>%
#     filter(imp == i)
#   this_y <- these_data$mucinous
#   this_x <- these_data %>%
#     select(starts_with("lab"), starts_with("cea"))
#   this_x_df <- as.data.frame(this_x)
#   fit <- SuperLearner::SuperLearner(
#     Y = this_y, X = this_x_df,
#     SL.library = learners,
#     cvControl = list(V = V),
#     family = "binomial"
#   )
#   # do extrinsic selection
#   all_selected_vars[[i]] <- extrinsic_selection(
#     fit = fit, feature_names = x_names, threshold = 5, import_type = "all"
#   )$selected
# }
# # pool the per-imputation selected sets into a final selection
# selected_vars <- pool_selected_sets(
#   sets = all_selected_vars, threshold = 1 / n_imp
# )
# x_names[selected_vars]