## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ---- cache = FALSE, include = FALSE------------------------------------------ ## copied from examples from the "simmer" R package ## after: https://www.enchufa2.es/archives/suggests-and-vignettes.html ## by Iñaki Úcar required <- c("BEDMatrix", "snpStats", "pryr") if (!all(sapply(required, requireNamespace, quietly = TRUE))) knitr::opts_chunk$set(eval = FALSE) ## ----------------------------------------------------------------------------- # Data dimensions. # Choose non-multiples of 4 to test edge cases of BED parsers. # Number of loci. m_loci <- 10001 # Number of individuals n_ind <- 1001 # Overall allele frequency # (we don't do any actual genetics in this vignette, # so just set to a reasonable value) p <- 0.5 # Missingness rate miss <- 0.1 # Total number of genotypes n_data <- n_ind * m_loci # Draw random genotypes from Binomial X <- rbinom( n_data, 2, p) # Add missing values X[ sample(n_data, n_data * miss) ] <- NA # Turn into matrix X <- matrix(X, nrow = m_loci, ncol = n_ind) # Inspect the first 10 individuals at the first 10 loci X[1:10, 1:10] ## ----------------------------------------------------------------------------- library(genio) ## ----------------------------------------------------------------------------- # We have to specify the number of loci bim <- make_bim( n = m_loci ) # Inspect the default values bim # Let's add the "chr" prefix to the chromosome values, # so we recognize them when we see them later. bim$chr <- paste0('chr', bim$chr) # Make SNP IDs look like "rs" IDs bim$id <- paste0('rs', bim$id) # Make positions 1000 bigger bim$pos <- bim$pos * 1000 # Select randomly between Cs and Gs for the reference alleles bim$ref <- sample(c('C', 'G'), m_loci, replace = TRUE) # Select randomly between As and Ts for the alternative alleles bim$alt <- sample(c('A', 'T'), m_loci, replace = TRUE) # Inspect the table with our changes bim ## ----------------------------------------------------------------------------- # Specify the number of individuals fam <- make_fam( n = n_ind ) # Inspect the default values fam # Add prefixes to families and IDs to recognize them later fam$fam <- paste0('fam', fam$fam) fam$id <- paste0('id', fam$id) # Sex values are usually 1 and 2 fam$sex <- sample(1:2, n_ind, replace = TRUE) # Let's make phenotypes continuous. # Draw independently from Standard Normal. fam$pheno <- rnorm(n_ind) # Let's leave maternal and paternal IDs as missing # Inspect again fam ## ----------------------------------------------------------------------------- # Add column and row names from bim and fam tables we just created. rownames(X) <- bim$id colnames(X) <- fam$id # Inspect again the first 10 individuals and loci X[1:10, 1:10] ## ----------------------------------------------------------------------------- # Will delete at the end of the vignette file_plink <- tempfile('vignette-random-data') # Write genotypes, along with the BIM and FAM files we created. # Omiting them would result in writing the original dummy version of these tables, # before we edited them. time_write_genio <- system.time( write_plink(file_plink, X, bim, fam) ) time_write_genio ## ----------------------------------------------------------------------------- # Read the data back in memory. # Time this step time_read_genio <- system.time( data_genio <- read_plink(file_plink) ) time_read_genio # Inspect the data we just read back # The same random genotypes (first 10 individuals and loci, now with row and column names): data_genio$X[1:10, 1:10] # The locus annotations data_genio$bim # The individual annotations data_genio$fam # Quickly test that the inputs and outputs are identical. # Genotypes have NAs, so we have to compare this way. stopifnot( all( X == data_genio$X, na.rm = TRUE) ) stopifnot( bim == data_genio$bim ) # FAM has mixed content (chars, ints, and doubles). # First 5 columns should be exact match: stopifnot( fam[,1:5] == data_genio$fam[,1:5] ) # Exact equality may fail for pheno due to precisions, so test this way instead: stopifnot( max(abs(fam$pheno - data_genio$fam$pheno)) < 1e-4 ) ## ----------------------------------------------------------------------------- # Constants bytes_per_genotype <- 4 bytes_per_gb <- 1024 ^ 3 # Example data dimensions num_ind <- 1000 num_loci <- 500000 # Gigabytes per 1000 individuals for a typical genotyping array bytes_per_genotype * num_ind * num_loci / bytes_per_gb ## ----------------------------------------------------------------------------- library(BEDMatrix) # Time it too. # Although the BIM and FAM tables are not returned, # they are partially parsed and kept in memory, # which can take time for extremely large files time_read_bedmatrix_1 <- system.time( X_BEDMatrix <- BEDMatrix(file_plink) ) # Inspect the first 10 loci and individuals as usual. # Note it is transposed compared to our X. # Also note the column and row names are different from genio's. X_BEDMatrix[1:10, 1:10] ## ----------------------------------------------------------------------------- # This turns it into a regular R matrix. # Since most of the reading is actually happening now, # we time this step now. time_read_bedmatrix_2 <- system.time( X_BEDMatrix_Rmat <- as.matrix(X_BEDMatrix) ) time_read_bedmatrix_2 # Now we can test that the BEDMatrix agrees with the original matrix we simulated. # Need to transpose first. stopifnot( all( X == t(X_BEDMatrix_Rmat), na.rm = TRUE) ) ## ----------------------------------------------------------------------------- library(snpStats) # Read data, time it. time_read_snpStats_1 <- system.time( data_snpStats <- read.plink(file_plink) ) time_read_snpStats_1 # Inspect the data # Genotypes. # Note data is not visualized this way. # This matrix is also transposed compared to the genio matrix. data_snpStats$genotypes # Locus annotations head( data_snpStats$map ) # Individual annotations head (data_snpStats$fam ) ## ----------------------------------------------------------------------------- # Transpose, then convert to a regular R matrix. # Let's time this step too. time_read_snpStats_2 <- system.time( X_snpStats <- as( t(data_snpStats$genotypes), 'numeric') ) time_read_snpStats_2 # Now we can visualize the matrix. # First 10 loci and individuals, as before. # Note that, compared to (genio, BEDMatrix), alleles are encoded in reverse, # so 0s and 2s are flipped in this matrix. X_snpStats[1:10, 1:10] # Again verify that the matrices are identical. # (Here 2-X flips back 0s and 2s) stopifnot( all( X == 2 - X_snpStats, na.rm = TRUE) ) ## ----------------------------------------------------------------------------- # Let's write this to another file file_plink_copy <- tempfile('vignette-random-data-copy') # Copy objects to not change originals X_snpStats <- X bim_snpStats <- as.data.frame(bim) # to use rownames fam_snpStats <- as.data.frame(fam) # ditto # All data requires matching row and/or column names. # These first two were already done above. #rownames(X_snpStats) <- bim$id #colnames(X_snpStats) <- fam$id # Row names here are redundant but required. rownames(bim_snpStats) <- bim$id rownames(fam_snpStats) <- fam$id # We shall time several required steps in order to write genotypes in a standard R matrix, # and the related annotation tables, to BED. time_write_snpStats <- system.time({ # Transpose and convert our genotypes to SnpMatrix object. # We flip 0s and 2s before converting X_snpStats <- as(2 - t(X_snpStats), 'SnpMatrix') # This complicated command is required to write the data. # Although X, fam, and bim are passed as for genio's write_plink, # here the name of every column must be specified (there are no reasonable defaults). # Interestingly, the parameter names of snpStats' write.plink # do not agree with read.plink from the same package. write.plink( file_plink_copy, snps = X_snpStats, subject.data = fam_snpStats, pedigree = fam, id = id, father = pat, mother = mat, sex = sex, phenotype = pheno, snp.data = bim_snpStats, chromosome = chr, genetic.distance = posg, position = pos, allele.1 = ref, allele.2 = alt ) }) # remove the new file, no longer need it delete_files_plink(file_plink_copy) ## ---- fig.width = 6, fig.height = 4, fig.align = 'center'--------------------- # Extract component 3 of each time object, # which is is total time elapsed. # Sum the two steps it takes for each of BEDMatrix and snpStats to obtain a native R matrix. times_read <- c( time_read_genio[3], time_read_bedmatrix_1[3] + time_read_bedmatrix_2[3], time_read_snpStats_1[3] + time_read_snpStats_2[3] ) names_read <- c( 'genio', 'BEDMatrix', 'snpStats' ) # Create barplot barplot( times_read, names.arg = names_read, main = 'BED reader runtimes', xlab = 'packages', ylab = 'runtime (s)' ) ## ---- fig.width = 4, fig.height = 4, fig.align = 'center'--------------------- times_write <- c( time_write_genio[3], time_write_snpStats[3] ) names_write <- c( 'genio', 'snpStats' ) # Create barplot barplot( times_write, names.arg = names_write, main = 'BED writer runtimes', xlab = 'packages', ylab = 'runtime (s)' ) ## ---- fig.width = 6, fig.height = 4, fig.align = 'center'--------------------- library(lobstr) # Store directly into a vector sizes <- c( obj_size( X ), obj_size( data_genio$X ), obj_size( X_BEDMatrix ), obj_size( data_snpStats$genotypes ) ) names_sizes <- c( 'original', 'genio', 'BEDMatrix', 'snpStats' ) # Create barplot barplot( as.numeric( sizes ), names.arg = names_sizes, main = 'Native genotype object sizes', xlab = 'packages', ylab = 'memory (bytes)' ) ## ----------------------------------------------------------------------------- delete_files_plink(file_plink) ## ----------------------------------------------------------------------------- sessionInfo()