Allow for using subset of variants for CV

cumc · Feb 4, 2024 · 1d6014e · 1d6014e
1 parent e0e50bb
commit 1d6014e
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Encoding: UTF-8
 Type: Package
 Package: pecotmr
-Version: 0.1.26
-Date: 2024-01-31
+Version: 0.1.27
+Date: 2024-02-03
 Title: Pair-wise enrichment, colocalization, TWAS and Mendelian Randomization to integrate molecular QTL and GWAS.
 Description: The majority of the statistical models in pecotmr are based on fine-mapped single effects described in Wang G et al (2020) JRSS-B. It also incoporates several other TWAS methods as well as utility functions for QTL and GWAS integration.
 URL: https://github.com/cumc/pecotmr

diff --git a/R/encoloc.R b/R/encoloc.R
@@ -110,12 +110,6 @@ calculate_purity <- function(variants, ext_ld, squared) {
  purity
 }
 
-#' Function to convert region df to str
-convert_to_string <- function(df) {
- result <- paste0("chr", df$chrom, ":", df$start, "-", df$end)
- return(result)
-}
-
 #' Main processing function
 #' This function is designed to summarize coloc results based on the following criteria:
 #' 1. Among the colocalized variant pairs, PPH4 has the highest value compared to PPH0-PPH3.
@@ -225,6 +219,7 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path,analysis_scrip
 coloc_wrapper <- function(xqtl_file, gwas_files, xqtl_condition = NULL, gwas_condition = NULL, 
  finemapping_obj = "susie_result_trimmed", varname_obj = "variant_names", xqtl_region_obj = "region_info",
  prior_tol = 1e-9, p1=1e-4, p2=1e-4, p12=5e-6, ...) {
+
  # Load and process GWAS data
  gwas_lbf_matrices <- lapply(gwas_files, function(file) {
  raw_data <- readRDS(file)
@@ -275,8 +270,9 @@ coloc_wrapper <- function(xqtl_file, gwas_files, xqtl_condition = NULL, gwas_con
  # Report the number of dropped columns from xQTL matrix
  num_dropped_cols <- length(setdiff(colnames(xqtl_lbf_matrix), common_colnames))
  message("Number of columns dropped from xQTL matrix: ", num_dropped_cols)
-
-
+
+ # Function to convert region df to str
+ convert_to_string <- function(df) paste0("chr", df$chrom, ":", df$start, "-", df$end)
  region <- if (!is.null(xqtl_region_obj)) get_nested_element(xqtl_raw_data, xqtl_region_obj)$region %>% convert_to_string else NULL
 
  # COLOC function 

diff --git a/R/twas.R b/R/twas.R
@@ -40,6 +40,7 @@ twas_z <- function(weights, z, R=NULL, X=NULL) {
 #' @param weight_methods A list of methods and their specific arguments, formatted as list(method1 = method1_args, method2 = method2_args). 
 #' methods in the list can be either univariate (applied to each column of Y) or multivariate (applied to the entire Y matrix).
 #' @param seed An optional integer to set the random seed for reproducibility of sample splitting.
+#' @param max_num_variants An optional integer to set the randomly selected maximum number of variants to use for CV purpose, to save computing time.
 #' @param num_threads The number of threads to use for parallel processing.
 #' If set to -1, the function uses all available cores.
 #' If set to 0 or 1, no parallel processing is performed.
@@ -65,7 +66,7 @@ twas_z <- function(weights, z, R=NULL, X=NULL) {
 #' @importFrom foreach %dopar%
 #' @importFrom doParallel registerDoParallel
 #' @export
-twas_weights_cv <- function(X, Y, fold = NULL, sample_partitions = NULL, weight_methods = NULL, seed = NULL, num_threads = 1, ...) {
+twas_weights_cv <- function(X, Y, fold = NULL, sample_partitions = NULL, weight_methods = NULL, seed = NULL, max_num_variants = NULL, num_threads = 1, ...) {
  split_data <- function(X, Y, sample_partition, fold){
  if (is.null(rownames(X))) {
  warning("Row names in X are missing. Using row indices.")
@@ -111,6 +112,13 @@ twas_weights_cv <- function(X, Y, fold = NULL, sample_partitions = NULL, weight_
  } else {
  sample_names <- 1:nrow(X)
  }
+
+ # Select variants if necessary
+ if (!is.null(max_num_variants) && ncol(X)> max_num_variants) {
+ selected_columns <- sort(sample(ncol(X), max_num_variants, replace = FALSE))
+ message(paste("Randomly selecting", ncol(X[, selected_columns]), "out of", ncol(X), "variants for cross validation purpose."))
+ X <- X[, selected_columns]
+ }
 
  arg <-list(...)
 

diff --git a/man/twas_weights_cv.Rd b/man/twas_weights_cv.Rd