## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----eval=FALSE--------------------------------------------------------------- # # The simulator # install.packages("remotes") # remotes::install_github("tgbrooks/dependentsimr") # # # DESeq2 - optional dependency only needed if doing RNA-seq data sets # # Needed for type = "DESeq2" # install.packages("BiocManager") # BiocManager::install("DESeq2") # # # Two more dependencies that are used in some simulation modes # # Needed for method = "corpcor" # install.packages("corpcor") # # # Needed for method = "spiked Wishart" # install.packages("sparsesvd") # # # Extras needed for this vignette # install.packages("tidyverse") ## ----------------------------------------------------------------------------- library(dependentsimr) suppressWarnings(suppressMessages(library(tidyverse, DESeq2))) ## ----------------------------------------------------------------------------- set.seed(0) ## ----------------------------------------------------------------------------- head(read_counts) ## ----------------------------------------------------------------------------- count_matrix <- as.matrix(read_counts[,2:ncol(read_counts)]) rownames(count_matrix) <- read_counts$gene_id ## ----------------------------------------------------------------------------- rs_pca <- get_random_structure(list(data=count_matrix), method="pca", rank=2, types="DESeq2") ## ----------------------------------------------------------------------------- rs_wishart <- get_random_structure(list(data=count_matrix), rank=11, method="spiked Wishart", types="DESeq2") ## ----------------------------------------------------------------------------- rs_corpcor <- get_random_structure(list(data=count_matrix), method="corpcor", types="DESeq2") ## ----------------------------------------------------------------------------- rs_indep <- remove_dependence(rs_pca) ## ----------------------------------------------------------------------------- actual_library_sizes <- count_matrix |> apply(2, sum) N_SAMPLES <- 6 library_sizes <- sample(actual_library_sizes / mean(actual_library_sizes), size=N_SAMPLES, replace=TRUE) ## ----------------------------------------------------------------------------- control_sim_data <- draw_from_multivariate_corr(rs_pca, n_samples=N_SAMPLES, size_factors=library_sizes)$data ## ----------------------------------------------------------------------------- # fraction of all genes to modify in treatment FRAC_DE <- 0.1 # Min and max value for the log fold change (LFC) for each DE gene chosen # non-DE genes will have a LFC of 0 LFC_MIN <- 0.5 LFC_MAX <- 4.0 N_DE <- floor(FRAC_DE * nrow(control_sim_data)) # Pick which genes are DE de_genes <- sample(nrow(control_sim_data), N_DE) # Pick the LFCs of each of those genes de_lfc <- runif(N_DE, LFC_MIN, LFC_MAX) * sample(c(-1, 1), size=N_DE, replace=TRUE) ## ----------------------------------------------------------------------------- rs_treatment <- rs_pca rs_treatment$marginals$data$q[de_genes] <- 2^(de_lfc) * rs_treatment$marginals$data$q[de_genes] ## ----------------------------------------------------------------------------- treatment_sim_data <- draw_from_multivariate_corr(rs_treatment, n_samples=N_SAMPLES, size_factors=library_sizes)$data ## ----------------------------------------------------------------------------- sim_data <- cbind(control_sim_data, treatment_sim_data) pca <- prcomp(t(sim_data), center=TRUE, scale.=TRUE, rank.=2) pca_points <- predict(pca, t(sim_data)) pca_data <- tibble(x=pca_points[,1], y=pca_points[,2]) |> mutate(group = c(rep("control", N_SAMPLES), rep("treatment", N_SAMPLES))) ggplot(pca_data, aes(x, y, color=group)) + geom_point() + labs(x = "PC1", y = "PC2")