diff --git a/DESCRIPTION b/DESCRIPTION index e1f3dc38..881744ff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: infercnv Type: Package Title: Infer Copy Number Variation from Single-Cell RNA-Seq Data -Version: 0.8.1 -Date: 2017-05-25 +Version: 0.8.2 +Date: 2018-11-08 Authors@R: c( person("Timothy", "Tickle", email = "ttickle@broadinstitute.org", role = c("aut", "cre")), person("Itay", "Tirosh", email = "tirosh@broadinstitute.org", role = "aut"), person("Christophe", "Georgescu", email = "cgeorges@broadinstitute.org", role = "aut"), person("Maxwell", "Brown", email = "mbrown@broadinstitute.org", role = "aut"), person("Brian", "Haas", email = "bhaas@broadinstitute.org", role = "aut")) Author: Timothy Tickle [aut, cre], Itay Tirosh [aut], Christophe Georgescu [aut], Maxwell Brown [aut], Brian Haas [aut] Maintainer: Christophe Georgescu diff --git a/NAMESPACE b/NAMESPACE index fb196667..18265d54 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,8 @@ import(RColorBrewer) import(coin) import(futile.logger) importFrom(Matrix,Matrix) +importFrom(Matrix,colSums) +importFrom(Matrix,rowMeans) importFrom(ape,as.phylo) importFrom(ape,write.tree) importFrom(binhf,ansc) diff --git a/R/NextGenHeatMap.R b/R/NextGenHeatMap.R index 926ecd46..fae86640 100644 --- a/R/NextGenHeatMap.R +++ b/R/NextGenHeatMap.R @@ -84,21 +84,13 @@ Create_NGCHM <- function(infercnv_obj, } ## set variables - reference_idx = row.names(plot_data[unlist(infercnv_obj@reference_grouped_cell_indices),]) ref_index = infercnv_obj@reference_grouped_cell_indices - ref_groups = names(infercnv_obj@reference_grouped_cell_indices) + reference_idx = row.names(plot_data[unlist(ref_index),]) + ref_groups = names(ref_index) # ---------------------- Import Dendrogram & Order Rows ----------------------------------------------------------------------------------- # IF Cluster By Group is set to TRUE: # Get the order of the rows (cell lines) from the dendrogram created by infer_cnv - # - - ## import and read the dendrogram for the observed data created using the ape library - #den_path <- paste(out_dir, "observations_dendrogram.txt", sep=.Platform$file.sep) - #phylo <- ape::read.tree(file = den_path) - # if multiphylo trees, need to iterate to get to labels - #obs_order <- rev(unlist(lapply(1:length(phylo), function(x) phylo[[x]]$tip.label))) # vector holding cell line order taken from the dendrogram - # read the file containing the groupings created by infer_cnv row_groups_path <- paste(out_dir, "observation_groupings.txt", sep=.Platform$file.sep) @@ -213,75 +205,63 @@ Create_NGCHM <- function(infercnv_obj, display = "visible", thickness = as.integer(20)) - # Covariate bar for annotation groups + # Covariate to identify Reference and Observed data annotation_col <- as.character(unlist(row_groups["Annotation.Color"])) # group colors annotation_group <- as.character(unlist(row_groups["Annotation.Group"]))# group number names(annotation_group) <- cells names(annotation_col) <- cells - annotation_palette <- get_group_color_palette()(length(unique(annotation_group))) annotation_unique_group <- unique(annotation_group) - ## create color mapping - colMap_annotation <- NGCHM::chmNewColorMap(values = as.vector(annotation_unique_group), # row names are the cells - colors = annotation_palette, - missing.color = "white") - annotation_cov <- NGCHM::chmNewCovariate(fullname = 'Annotation', - values = annotation_group, - value.properties = colMap_annotation, - type = "discrete") - hm <- NGCHM::chmAddCovariateBar(hm, "row", annotation_cov, - display = "visible", - thickness = as.integer(20)) - # Covariate to identify Reference and Observed data - cell_type <- replace(row_order, 1:length(row_order) %in% unlist(infercnv_obj@observation_grouped_cell_indices), paste("Observed")) + len <-lengths(ref_index) + ref_bar_labels <- unlist(sapply(1:length(len), function(x){ rep(ref_groups[x],len[x]) })) + names(ref_bar_labels) <- reference_idx - ref_groups = names(infercnv_obj@reference_grouped_cell_indices) + # if you want the exact coloring as the original inferCNV plots + #annotation_palette <- c(get_group_color_palette()(length(ref_index)), get_group_color_palette()(length(annotation_unique_group))) - ## Label the references based on index locations - if (length(ref_groups) > 1) { - for(i in 1:length(ref_groups)){ - cell_type <- replace(cell_type, infercnv_obj@reference_grouped_cell_indices[[i]], paste("Reference",toString(i),sep = "")) - } - } else { - for(i in 1:length(ref_groups)){ - cell_type <- replace(cell_type, 1:length(cell_type) %in% infercnv_obj@reference_grouped_cell_indices[[1]], paste("Reference")) - } - } - # make a new variable for later use that has the cell type and cell ID as the name - ## cell ID's need to map to cell types - names(cell_type) <- row_order + # combine reference and observed labels + annotation_group <- c(ref_bar_labels,annotation_group) + + # change the observed group names in bar to group namnes + observed_data <- infercnv_obj@observation_grouped_cell_indices + lapply(1:length(observed_data), function(x) { + tmp <- names(observed_data[x]) + annotation_group <<- replace(annotation_group, observed_data[[x]], tmp) } ) + unique_group <- unique(annotation_group) + annotation_palette <- get_group_color_palette()(length(unique_group)) - # check if all reference cells are in cell type - if (!(all(reference_idx %in% names(cell_type)))){ - missing_refs <- reference_idx[which(!(reference_idx %in% names(cell_type)))] + # check if all reference cells are included + if (!(all(reference_idx %in% names(annotation_group)))){ + missing_refs <- reference_idx[which(!(reference_idx %in% names(annotation_group)))] error_message <- paste("Error: Not all references are accounted for.", "Make sure the reference names match the names in the data.\n", "Check the following reference cell lines: ", paste(missing_refs, collapse = ",")) stop(error_message) } - if (!is.null(cell_type)){ - ## unique group names - types <- unique(cell_type) - ## create colors for groups - type_palette <- get_group_color_palette()(length(types)) - names(type_palette) <- types - - colMap_type <- NGCHM::chmNewColorMap(values = types, - names = types, - colors = type_palette, - missing.color = "white", - type = "linear") - - type_cov <- NGCHM::chmNewCovariate(fullname = 'Cell Type', - values = cell_type, - value.properties = colMap_type, - type = "discrete") - hm <- NGCHM::chmAddCovariateBar(hm, "row", type_cov, - display = "visible", - thickness = as.integer(20)) + # check if all observed cells are included + observed_idx <- row.names(plot_data[unlist(infercnv_obj@observation_grouped_cell_indices),]) + if (!(all(observed_idx %in% names(annotation_group)))){ + missing_obs <- reference_idx[which(!(observed_idx %in% names(annotation_group)))] + error_message <- paste("Error: Not all observed cell lines are accounted for.", + "Make sure the reference names match the names in the data.\n", + "Check the following reference cell lines: ", + paste(missing_obs, collapse = ",")) + stop(error_message) } + ## create color mapping + colMap_annotation <- NGCHM::chmNewColorMap(values = as.vector(unique_group), + colors = annotation_palette, + missing.color = "white") + annotation_cov <- NGCHM::chmNewCovariate(fullname = 'Annotation', + values = annotation_group, + value.properties = colMap_annotation, + type = "discrete") + hm <- NGCHM::chmAddCovariateBar(hm, "row", annotation_cov, + display = "visible", + thickness = as.integer(20)) + #---------------------------------------Export the heat map----------------------------------------------------------------------------------------------------------------------- ## adjust the size of the heat map #hm@width <- as.integer(500) diff --git a/R/inferCNV_constants.R b/R/inferCNV_constants.R index ec0caa0e..543ff7e1 100755 --- a/R/inferCNV_constants.R +++ b/R/inferCNV_constants.R @@ -20,7 +20,7 @@ C_OUTPUT_FORMAT <- c("pdf", "png") #' @importFrom ape write.tree as.phylo #' @importFrom fastcluster hclust #' @import RColorBrewer -#' @importFrom Matrix Matrix +#' @importFrom Matrix Matrix rowMeans colSums #' @import coin #' @importFrom dplyr %>% count diff --git a/R/inferCNV_heatmap.R b/R/inferCNV_heatmap.R index af5771bb..93ef2fb8 100755 --- a/R/inferCNV_heatmap.R +++ b/R/inferCNV_heatmap.R @@ -563,7 +563,7 @@ plot_cnv <- function(infercnv_obj, observation_file_base, sep=" ")) row.names(obs_data) <- orig_row_names - write.table(obs_data[data_observations$rowInd,data_observations$colInd], + write.table(t(obs_data[data_observations$rowInd,data_observations$colInd]), file=observation_file_base) } } diff --git a/R/inferCNV_ops.R b/R/inferCNV_ops.R index 894c4bff..d841e211 100755 --- a/R/inferCNV_ops.R +++ b/R/inferCNV_ops.R @@ -66,6 +66,10 @@ #' #' @param include.spike If true, introduces an artificial spike-in of data at ~0x and 2x for scaling residuals between 0-2. (default: F) #' +#' @param spike_in_chrs vector listing of chr names to use for modeling spike-ins (default: NULL - uses the two largest chrs. ex. c('chr1', 'chr2') ) +#' +#' @param spike_in_multiplier vector of weights matching spike_in_chrs (default: c(0.01, 2.0) for modeling loss/gain of both chrs) +#' #' @param pseudocount Number of counts to add to each gene of each cell post-filtering of genes and cells and pre-total sum count normalization. (default: 0) #' #' @param debug If true, output debug level logging. @@ -107,7 +111,7 @@ run <- function(infercnv_obj, use_zscores=FALSE, remove_genes_at_chr_ends=FALSE, - mask_nonDE_genes=TRUE, + mask_nonDE_genes=FALSE, mask_nonDE_pval=0.05, test.use='wilcoxon', @@ -116,7 +120,11 @@ run <- function(infercnv_obj, debug=FALSE, #for debug level logging include.spike = FALSE, - + + # must specify both below if to be used, and must match in vec length + spike_in_chrs = NULL, # use defaults + spike_in_multiplier_vec = NULL, # use defaults + pseudocount = 0 ) { @@ -202,10 +210,14 @@ run <- function(infercnv_obj, if (include.spike) { step_count = step_count + 1 flog.info(sprintf("\n\n\tSTEP %02d: Spiking in genes with variation added for tracking\n", step_count)) + + if (! (is.null(spike_in_chrs) && is.null(spike_in_multiplier_vec)) ) { + infercnv_obj <- spike_in_variation_chrs(infercnv_obj, spike_in_chrs, spike_in_multiplier_vec) + } else { + infercnv_obj <- spike_in_variation_chrs(infercnv_obj) + } - infercnv_obj <- spike_in_variation_chrs(infercnv_obj) - - # Plot incremental steps. + # Plot incremental steps. if (plot_steps){ infercnv_obj_spiked <- infercnv_obj @@ -657,9 +669,6 @@ run <- function(infercnv_obj, output_filename=sprintf("infercnv.%02d_scaled_by_spike", step_count)) } - # remove the spike now - infercnv_obj <- remove_spike(infercnv_obj) - } @@ -697,6 +706,12 @@ run <- function(infercnv_obj, } } + if (include.spike) { + # remove the spike before making the final plot. + infercnv_obj <- remove_spike(infercnv_obj) + } + + save('infercnv_obj', file=file.path(out_dir, "run.final.infercnv_obj")) flog.info("Making the final infercnv heatmap") @@ -1143,7 +1158,7 @@ center_cell_expr_across_chromosome <- function(infercnv_obj, method="mean") { # #' @title require_above_min_mean_expr_cutoff () #' -#' @description Filters out genes that have fewer than the corresponding mean value across the reference cell values. +#' @description Filters out genes that have fewer than the corresponding mean value across all cell values. #' #' @param infercnv_obj infercnv_object #' @@ -1158,10 +1173,8 @@ require_above_min_mean_expr_cutoff <- function(infercnv_obj, min_mean_expr_cutof flog.info(paste("::above_min_mean_expr_cutoff:Start", sep="")) - # restrict to reference cells: - ref_cells_data <- infercnv_obj@expr.data[ , get_reference_grouped_cell_indices(infercnv_obj) ] - indices <-.below_min_mean_expr_cutoff(ref_cells_data, min_mean_expr_cutoff) + indices <-.below_min_mean_expr_cutoff(infercnv_obj@expr.data, min_mean_expr_cutoff) if (length(indices) > 0) { flog.info(sprintf("Removing %d genes from matrix as below mean expr threshold: %g", length(indices), min_mean_expr_cutoff)) @@ -1195,7 +1208,7 @@ require_above_min_mean_expr_cutoff <- function(infercnv_obj, min_mean_expr_cutof #' @title require_above_min_cells_ref() #' -#' @description Filters out genes that have fewer than specified number of reference cells expressing them. +#' @description Filters out genes that have fewer than specified number of cells expressing them. #' #' @param infercnv_obj infercnv_object #' @@ -1207,15 +1220,11 @@ require_above_min_mean_expr_cutoff <- function(infercnv_obj, min_mean_expr_cutof #' require_above_min_cells_ref <- function(infercnv_obj, min_cells_per_gene) { - - ref_cell_indices = get_reference_grouped_cell_indices(infercnv_obj) - ref_data = infercnv_obj@expr.data[,ref_cell_indices] - - ref_genes_passed = which(apply(ref_data, 1, function(x) { sum(x>0 & ! is.na(x)) >= min_cells_per_gene})) + genes_passed = which(apply(infercnv_obj@expr.data, 1, function(x) { sum(x>0 & ! is.na(x)) >= min_cells_per_gene})) - num_genes_total = dim(ref_data)[1] - num_removed = num_genes_total - length(ref_genes_passed) + num_genes_total = dim(infercnv_obj@expr.data)[1] + num_removed = num_genes_total - length(genes_passed) if (num_removed > 0) { flog.info(sprintf("Removed %d genes having fewer than %d min cells per gene = %g %% genes removed here", @@ -1229,7 +1238,7 @@ require_above_min_cells_ref <- function(infercnv_obj, min_cells_per_gene) { } - infercnv_obj <- remove_genes(infercnv_obj, -1 * ref_genes_passed) + infercnv_obj <- remove_genes(infercnv_obj, -1 * genes_passed) } @@ -1904,7 +1913,9 @@ anscombe_transform <- function(infercnv_obj) { } - +#' @keywords internal +#' @noRd +#' add_pseudocount <- function(infercnv_obj, pseudocount) { flog.info(sprintf("Adding pseudocount: %g", pseudocount)) diff --git a/R/inferCNV_spike.R b/R/inferCNV_spike.R index 4431703c..b50889c1 100644 --- a/R/inferCNV_spike.R +++ b/R/inferCNV_spike.R @@ -79,18 +79,22 @@ spike_in_variation_chrs <- function(infercnv_obj, normal_cells_idx = infercnv::get_reference_grouped_cell_indices(infercnv_obj) normal_cells_expr = infercnv_obj@expr.data[,normal_cells_idx] + # zeros are a problem here... + gene_means = rowMeans(normal_cells_expr) + + mean_p0_table = .get_mean_vs_p0_table(infercnv_obj) + ## apply spike-in multiplier vec for (i in 1:length(spike_in_multiplier_vec)) { gene_indices = gene_selection_listing[[i]] multiplier = spike_in_multiplier_vec[i] - - normal_cells_expr[gene_indices, ] = normal_cells_expr[gene_indices, ] * multiplier - - } + gene_means[gene_indices] = gene_means[gene_indices] * multiplier + } + ## get simulated matrix - sim_matrix = .get_simulated_cell_matrix(mvtable, normal_cells_expr, max_cells) + sim_matrix = .get_simulated_cell_matrix(gene_means, mean_p0_table, max_cells) ## integrate into expr data and count data matrices ncol_begin = ncol(infercnv_obj@expr.data) + 1 @@ -114,11 +118,9 @@ spike_in_variation_chrs <- function(infercnv_obj, ##' the mean/variance relationship for all genes in all cell groupings. ##' ##' Cells are simulated as so: -##' A random cell is selected from the normal cell expression matrix. -##' The expression of each gene is treated as a targeted mean expression value. -##' The variance is chosen based on a spline fit to the mean/variance relationship provided. -##' A random expression value is generated from a normal distribution with corresponding (mean, variance) -##' +##' The mean for genes in the normal cells are computed +##' A random expression value is chosen for each gene using a negative binomial distribution with dispersion = 0.1 +##' ##' Genes are named according to the input expression matrix, and cells are named 'spike_{number}'. ##' ##' @param mean_var_table : a data.frame containing three columns: group_name, mean, variance of expression per gene per grouping. @@ -133,38 +135,58 @@ spike_in_variation_chrs <- function(infercnv_obj, ##' @noRd ##' -.get_simulated_cell_matrix <- function(mean_var_table, normal_cell_expr, num_cells) { +.get_simulated_cell_matrix <- function(gene_means, mean_p0_table, num_cells) { # should be working on the total sum count normalized data. # model the mean variance relationship + + ngenes = length(gene_means) + + dropout_logistic_params <- .get_logistic_params(mean_p0_table) - s = smooth.spline(log2(mean_var_table$m+1), log2(mean_var_table$v+1)) - spike_cell_names = paste0("spike_", 1:num_cells) - - ngenes = nrow(normal_cell_expr) + spike_cell_names = paste0('spike_cell_', 1:num_cells) - sim_expr_val <- function(gene_idx, rand_cell_idx) { - m = normal_cell_expr[gene_idx, rand_cell_idx] - v = predict(s, log2(m+1))$y - v = max(0, 2^v-1) - val = max(0, rnorm(n=1, mean=m, sd=sqrt(v))) - return(val) - } - sim_cell_matrix = matrix(rep(0,ngenes*num_cells), nrow=ngenes) - rownames(sim_cell_matrix) = rownames(normal_cell_expr) + rownames(sim_cell_matrix) = names(gene_means) colnames(sim_cell_matrix) = spike_cell_names + + sim_expr_vals <- function(gene_idx) { + m = gene_means[gene_idx] + return(.sim_expr_val(m, dropout_logistic_params)) + } for (i in 1:num_cells) { - rand_cell_idx = floor(runif(1) * ncol(normal_cell_expr)+1) - newvals = sapply(1:ngenes, FUN=sim_expr_val, rand_cell_idx) + newvals = sapply(1:ngenes, FUN=sim_expr_vals) sim_cell_matrix[,i] = newvals } return(sim_cell_matrix) } + +##' @keywords internal +##' @noRd +##' + +.sim_expr_val <- function(m, dropout_logistic_params) { + # include drop-out prediction + + val = 0 + if (m > 0) { + dropout_prob <- .logistic(x=log(m), midpt=dropout_logistic_params$midpt, slope=dropout_logistic_params$slope) + + if (runif(1) > dropout_prob) { + # not a drop-out + val = rnbinom(n=1, mu=m, size=1/0.1) #fixed dispersion at 0.1 + } + } + return(val) +} + + + + ##' .get_mean_var_table() ##' ##' Computes the gene mean/variance table based on all defined cell groupings (reference and observations) @@ -199,7 +221,7 @@ spike_in_variation_chrs <- function(infercnv_obj, return(mean_var_table) } -##' get_spike_in_average_bounds() +##' .get_spike_in_average_bounds() ##' ##' return mean bounds for expression of all cells in the spike-in ##' @@ -292,7 +314,11 @@ scale_cnv_by_spike <- function(infercnv_obj) { } -# selects the specified number of chrs having the largest number of (expressed) genes +#' selects the specified number of chrs having the largest number of (expressed) genes +#' @keywords internal +#' @noRd +#' + .select_longest_chrs <- function(infercnv_obj, num_chrs_want) { # get count of chrs @@ -301,3 +327,105 @@ scale_cnv_by_spike <- function(infercnv_obj) { return(counts$chr[1:num_chrs_want]) } + +#' Computes probability of seeing a zero expr val as a function of the mean gene expression +#' The p(0 | mean_expr) is computed separately for each sample grouping. +#' +#' @keywords internal +#' @noRd +#' + +.get_mean_vs_p0_table <- function(infercnv_obj) { + + group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) + + mean_p0_table = NULL + + for (group_name in names(group_indices)) { + flog.info(sprintf("processing group: %s", group_name)) + expr.data = infercnv_obj@expr.data[, group_indices[[ group_name ]] ] + + group_mean_p0_table <- .get_mean_vs_p0_from_matrix(expr.data) + group_mean_p0_table[[ 'group_name' ]] <- group_name + + if (is.null(mean_p0_table)) { + mean_p0_table = group_mean_p0_table + } else { + mean_p0_table = rbind(mean_p0_table, group_mean_p0_table) + } + } + + return(mean_p0_table) +} + +#' Computes probability of seeing a zero expr val as a function of the mean gene expression +#' based on the input expression matrix. +#' +#' @keywords internal +#' @noRd +#' + +.get_mean_vs_p0_from_matrix <- function(expr.data) { + ncells = ncol(expr.data) + m = rowMeans(expr.data) + numZeros = apply(expr.data, 1, function(x) { sum(x==0) }) + + pZero = numZeros/ncells + + mean_p0_table = data.frame(m=m, p0=pZero) + + return(mean_p0_table) +} + + +#' +#' Logistic function +#' +#' InferCNV note: Standard function here, but lifted from +#' Splatter (Zappia, Phipson, and Oshlack, 2017) +#' https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1305-0 +#' +#' Implementation of the logistic function +#' +#' @param x value to apply the function to. +#' @param x0 midpoint parameter. Gives the centre of the function. +#' @param k shape parameter. Gives the slope of the function. +#' +#' @return Value of logistic function with given parameters +#' +#' @keywords internal +#' @noRd +#' +.logistic <- function(x, midpt, slope) { + 1 / (1 + exp(-slope * (x - midpt))) +} + + + +#' Given the mean, p0 table, fits the data to a logistic function to compute +#' the shape of the logistic distribution. +#' +#' @keywords internal +#' @noRd +#' + +.get_logistic_params <- function(mean_p0_table) { + + mean_p0_table <- mean_p0_table[mean_p0_table$m > 0, ] # remove zeros, can't take log. + + x = log(mean_p0_table$m) + y = mean_p0_table$p0 + + df = data.frame(x,y) + + #write.table(df, "_logistic_params", quote=F, sep="\t") # debugging... + + fit <- nls(y ~ .logistic(x, midpt = x0, slope = k), data = df, start = list(x0 = mean(x), k = -1)) # borrowed/updated from splatter + + logistic_params = list() + + logistic_params[[ 'midpt' ]] <- summary(fit)$coefficients["x0", "Estimate"] + logistic_params[[ 'slope' ]] <- summary(fit)$coefficients["k", "Estimate"] + + return(logistic_params) +} diff --git a/example/example.Rmd b/example/example.Rmd index 1e6b4c71..503ae642 100644 --- a/example/example.Rmd +++ b/example/example.Rmd @@ -276,7 +276,6 @@ knitr::include_graphics("infercnv.non-DE-genes-masked.png") ``` -<<<<<<< HEAD ## Brighten it up by changing the scale threshold to our liking: ```{r} diff --git a/example/example.html.REMOVED.git-id b/example/example.html.REMOVED.git-id index faef62c8..9f69f952 100644 --- a/example/example.html.REMOVED.git-id +++ b/example/example.html.REMOVED.git-id @@ -1 +1 @@ -ac0f2f6b6c6f94a77b88b0ca90565069e056017b \ No newline at end of file +33ba46b4f974bd0d8faff86a7e4a4aae112b6a8b \ No newline at end of file diff --git a/example/run.R b/example/run.R index 903a2a85..9b96fea2 100755 --- a/example/run.R +++ b/example/run.R @@ -16,6 +16,7 @@ infercnv_obj = infercnv::run(infercnv_obj, out_dir=out_dir, cluster_by_groups=T, plot_steps=F, + mask_nonDE_genes = T, include.spike=T # used for final scaling to fit range (0,2) centered at 1. ) diff --git a/man/ngchm.Rd b/man/ngchm.Rd index c9bb9981..acf41a9c 100644 --- a/man/ngchm.Rd +++ b/man/ngchm.Rd @@ -17,6 +17,10 @@ ngchm(infercnv_obj, out_dir = ".", title = "NGCHM", gene_symbol = NULL, \item{gene_symbol}{##TODO (default: NULL)} \item{path_to_shaidyMapGen}{path to the shaidyMapGen jar file (default: NULL)} + +\item{x.range}{(integer) Values for minimum and maximum thresholds for heatmap coloring.} + +\item{x.center}{(integer) Center expression value for heatmap coloring.} } \description{ Function for Generating a next-generation heatmap diff --git a/man/require_above_min_cells_ref.Rd b/man/require_above_min_cells_ref.Rd index 7e0a1647..b6463a0c 100644 --- a/man/require_above_min_cells_ref.Rd +++ b/man/require_above_min_cells_ref.Rd @@ -15,5 +15,5 @@ require_above_min_cells_ref(infercnv_obj, min_cells_per_gene) infercnv_obj infercnv_object with corresponding genes removed. } \description{ -Filters out genes that have fewer than specified number of reference cells expressing them. +Filters out genes that have fewer than specified number of cells expressing them. } diff --git a/man/require_above_min_mean_expr_cutoff.Rd b/man/require_above_min_mean_expr_cutoff.Rd index 20412803..fba460be 100644 --- a/man/require_above_min_mean_expr_cutoff.Rd +++ b/man/require_above_min_mean_expr_cutoff.Rd @@ -15,5 +15,5 @@ require_above_min_mean_expr_cutoff(infercnv_obj, min_mean_expr_cutoff) infercnv_obj the infercnv_object with lowly or unexpressed genes removed. } \description{ -Filters out genes that have fewer than the corresponding mean value across the reference cell values. +Filters out genes that have fewer than the corresponding mean value across all cell values. } diff --git a/man/run.Rd b/man/run.Rd index c4ada1ce..9b4e8bc6 100644 --- a/man/run.Rd +++ b/man/run.Rd @@ -11,9 +11,10 @@ run(infercnv_obj, cutoff = 1, min_cells_per_gene = 3, out_dir = ".", outlier_method_bound = "average_bound", outlier_lower_bound = NA, outlier_upper_bound = NA, hclust_method = "complete", anscombe_normalize = TRUE, use_zscores = FALSE, - remove_genes_at_chr_ends = FALSE, mask_nonDE_genes = TRUE, + remove_genes_at_chr_ends = FALSE, mask_nonDE_genes = FALSE, mask_nonDE_pval = 0.05, test.use = "wilcoxon", plot_steps = FALSE, - debug = FALSE, include.spike = FALSE, pseudocount = 0) + debug = FALSE, include.spike = FALSE, spike_in_chrs = NULL, + spike_in_multiplier_vec = NULL, pseudocount = 0) } \arguments{ \item{infercnv_obj}{An infercnv object populated with raw count data} @@ -79,7 +80,11 @@ the mean value for the complete data set} \item{include.spike}{If true, introduces an artificial spike-in of data at ~0x and 2x for scaling residuals between 0-2. (default: F)} +\item{spike_in_chrs}{vector listing of chr names to use for modeling spike-ins (default: NULL - uses the two largest chrs. ex. c('chr1', 'chr2') )} + \item{pseudocount}{Number of counts to add to each gene of each cell post-filtering of genes and cells and pre-total sum count normalization. (default: 0)} + +\item{spike_in_multiplier}{vector of weights matching spike_in_chrs (default: c(0.01, 2.0) for modeling loss/gain of both chrs)} } \value{ infercnv_obj containing filtered and transformed data diff --git a/scripts/examine_infercnv_data_params.R b/scripts/examine_infercnv_data_params.R new file mode 100755 index 00000000..ddd8ccb3 --- /dev/null +++ b/scripts/examine_infercnv_data_params.R @@ -0,0 +1,82 @@ +#!/usr/bin/env Rscript + +args<-commandArgs(TRUE) + +if (length(args) == 0) { + stop("Error, require params: infercnv.obj"); +} + +infercnv_file_obj = args[1] + +load(infercnv_file_obj) + + +infercnv_name_obj = grep("infercnv_obj", ls(), value=T)[1] + +print(infercnv_name_obj) + +infercnv_obj = get(infercnv_name_obj) + + +library(edgeR) +library(fitdistrplus) +library(infercnv) + +# borrowing some code from splatter + +get_parameters <- function(group_name, expr.matrix) { + + params = list() + params[['group_name']] = group_name + + # estimate gamma for genes + lib.sizes <- colSums(expr.matrix) + lib.med <- median(lib.sizes) + norm.counts <- t(t(expr.matrix) / lib.sizes * lib.med) + norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ] + + means <- rowMeans(norm.counts) + means.fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") + mean.shape = unname(means.fit$estimate["shape"]) + mean.rate = unname(means.fit$estimate["rate"]) + + params[[ 'gamma.mean.shape' ]] = mean.shape + params[[ 'gamma.mean.rate' ]] = mean.rate + + + # estimate dropout params + mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix) + logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table) + + params[['dropout.logistic.midpt']] = logistic_params$midpt + params[['dropout.logistic.slope']] = logistic_params$slope + + + # estimate common dispersion + design <- matrix(1, ncol(expr.matrix), 1) + disps <- edgeR::estimateDisp(expr.matrix, design = design) + + params[[ 'common.dispersion' ]] = disps$common.dispersion + + + return(params) + +} + + + +# examine each group +all_groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) + +for (group in names(all_groups)) { + + group_idxs = all_groups[[ group ]] + expr.data = infercnv_obj@expr.data[, group_idxs] + + params = get_parameters(group, expr.data) + params = t(as.data.frame(params)) + + print(params) + +} +