Skip to content

Commit

Permalink
Merge pull request #266 from al4225/main
Browse files Browse the repository at this point in the history
quantile_twas_weight: fix singular-matrix failures in rq and add heterogeneity calculation
  • Loading branch information
gaow authored Oct 8, 2024
2 parents 9ea8957 + f343b38 commit 5f0ad7d
Show file tree
Hide file tree
Showing 2 changed files with 241 additions and 54 deletions.
276 changes: 241 additions & 35 deletions R/quantile_twas_weight.R
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ perform_qr_analysis <- function(X, Y, Z = NULL, tau_values = seq(0.05, 0.95, by
}

# Fit the quantile regression model using rq.fit.br
mod <- rq.fit.br(X_design, response, tau = tau)
mod <- suppressWarnings(rq.fit.br(X_design, response, tau = tau))

# Extract the coefficient for the predictor (second coefficient)
predictor_coef <- mod$coefficients[2] # Coefficient for predictor
Expand Down Expand Up @@ -379,47 +379,240 @@ corr_filter <- function(X, cor_thres = 0.8) {
}


#' Calculate QR Coefficients and Pseudo R-squared
#' @param ExprData List containing X, Y, C, and X.filter
#' @param tau.list Vector of quantiles to be analyzed
#' @return A list containing beta matrix as twas weight and pseudo R-squared values
#' @importFrom quantreg rq rq.fit.br
calculate_qr_and_pseudo_R2 <- function(ExprData, tau.list) {
# Fit models for all taus
fit_full <- rq(Y ~ X.filter + C, tau = tau.list, data = ExprData)
fit_intercept <- rq(Y ~ 1, tau = tau.list, data = ExprData)

# Define rho function
rho <- function(u, tau) {
u * (tau - (u < 0))
}

# Prepare to store results
pseudo_R2 <- numeric(length(tau.list))
names(pseudo_R2) <- tau.list
#' Check and Remove Problematic Columns to Ensure Full Rank
#'
#' Iteratively removes SNP columns from `X` that make the design matrix
#' `cbind(1, X, C)` rank deficient, delegating the choice of which column
#' to drop to `remove_highcorr_snp()`. Covariate columns in `C` are never
#' removed. Stops when the design matrix becomes full rank, when no
#' removable SNP column remains, or after `max_iterations` passes.
#'
#' @param X Matrix of SNPs (must have column names).
#' @param C Matrix of covariates (kept unnamed in the design matrix).
#' @param strategy The strategy for removing problematic columns
#'   ("correlation", "variance", or "response_correlation").
#' @param response Optional response vector, required for the
#'   "response_correlation" strategy.
#' @param max_iterations Maximum number of removal passes to attempt.
#' @return Cleaned matrix X with problematic SNP columns removed.
#' @noRd
check_remove_highcorr_snp <- function(X, C, strategy = c("correlation", "variance", "response_correlation"), response = NULL, max_iterations = 100) {
  strategy <- match.arg(strategy)

  # Build the design matrix: intercept + SNPs + covariates. Only the
  # intercept and SNP columns are named, so covariate columns can never
  # be selected for removal below (removal is keyed on colnames(X)).
  build_design <- function(X) {
    X_design <- cbind(1, X, C)
    named_part <- c("Intercept", colnames(X))
    colnames(X_design)[seq_along(named_part)] <- named_part
    X_design
  }

  X_design <- build_design(X)
  matrix_rank <- qr(X_design)$rank
  message("Initial rank of the design matrix: ", matrix_rank, " / ", ncol(X_design), " columns.")

  iteration <- 0
  while (matrix_rank < ncol(X_design) && iteration < max_iterations) {
    message("Design matrix is not full rank, identifying problematic columns...")

    # In the pivoted QR decomposition, pivot positions beyond the rank
    # correspond to columns that are linear combinations of earlier ones.
    qr_decomp <- qr(X_design)
    rank_val <- qr_decomp$rank
    pivot <- qr_decomp$pivot
    problematic_idx <- pivot[(rank_val + 1):ncol(X_design)]
    problematic_colnames <- colnames(X_design)[problematic_idx]

    # Restrict removal candidates to SNP columns; covariates stay.
    problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)]

    if (length(problematic_colnames) == 0) {
      message("No more problematic SNP columns found in X. Breaking the loop.")
      break
    }

    message("Problematic SNP columns identified: ", paste(problematic_colnames, collapse = ", "))

    # Delegate the choice of which candidate column to drop to the strategy.
    X <- remove_highcorr_snp(X, problematic_colnames, strategy = strategy, response = response)

    # Rebuild and re-check rank with the cleaned X.
    X_design <- build_design(X)
    matrix_rank <- qr(X_design)$rank
    message("New rank of the design matrix: ", matrix_rank, " / ", ncol(X_design), " columns.")

    iteration <- iteration + 1
  }

  if (iteration == max_iterations) {
    warning("Maximum iterations reached. The design matrix may still not be full rank.")
  }

  return(X) # Return the cleaned X matrix
}

# Calculate pseudo R^2 for each tau
for (i in seq_along(tau.list)) {
tau <- tau.list[i]
#' Remove Problematic Columns Based on a Given Strategy
#'
#' Removes one problematic column from a SNP matrix, chosen according to
#' the requested strategy: smallest variance, highest total absolute
#' correlation with the other problematic columns, or lowest absolute
#' correlation with the response variable. With zero candidates the matrix
#' is returned unchanged; with exactly one candidate it is removed directly.
#'
#' @param X Matrix of SNPs (must have column names).
#' @param problematic_cols Character vector of candidate column names to remove.
#' @param strategy The strategy for removing problematic columns
#'   ("correlation", "variance", or "response_correlation").
#' @param response Optional response vector, required for the
#'   "response_correlation" strategy.
#' @return Cleaned matrix X with the selected column removed.
#' @importFrom stats var cor
#' @noRd
remove_highcorr_snp <- function(X, problematic_cols, strategy = c("correlation", "variance", "response_correlation"), response = NULL) {
  strategy <- match.arg(strategy)

  message("Identified problematic columns: ", paste(problematic_cols, collapse = ", "))

  if (length(problematic_cols) == 0) {
    return(X) # Nothing to remove
  }

  if (length(problematic_cols) == 1) {
    # Single candidate: no strategy needed, remove it directly.
    message("Only one problematic column: ", problematic_cols)
    col_to_remove <- problematic_cols[1]
    message("Removing column: ", col_to_remove)
    X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE]
    return(X)
  }

  # Choose the column to remove based on the strategy
  if (strategy == "variance") {
    # Strategy 1: remove the candidate with the smallest variance
    variances <- apply(X[, problematic_cols, drop = FALSE], 2, var)
    col_to_remove <- problematic_cols[which.min(variances)]
    message("Removing column with the smallest variance: ", col_to_remove)

  } else if (strategy == "correlation") {
    # Strategy 2: remove the candidate with the highest sum of absolute
    # correlations with the other candidates
    cor_matrix <- abs(cor(X[, problematic_cols, drop = FALSE]))
    diag(cor_matrix) <- 0 # Ignore self-correlation

    if (length(problematic_cols) == 2) {
      # With two candidates the correlation sums are tied by symmetry;
      # NOTE(review): sample() makes this branch non-deterministic —
      # consider a fixed tie-break for reproducibility.
      col_to_remove <- sample(problematic_cols, 1)
      message("Only two problematic columns, randomly removing: ", col_to_remove)
    } else {
      cor_sums <- colSums(cor_matrix)
      col_to_remove <- problematic_cols[which.max(cor_sums)]
      message("Removing column with highest sum of absolute correlations: ", col_to_remove)
    }

  } else if (strategy == "response_correlation" && !is.null(response)) {
    # Strategy 3: remove the candidate least correlated (in absolute
    # value) with the response variable.
    # FIXME: This strategy is potentially biased based on corr of response and variants
    cor_with_response <- apply(X[, problematic_cols, drop = FALSE], 2, function(col) cor(col, response))
    col_to_remove <- problematic_cols[which.min(abs(cor_with_response))]
    message("Removing column with lowest correlation with the response: ", col_to_remove)
  } else {
    stop("Invalid strategy or missing response variable for 'response_correlation' strategy.")
  }

  # Remove the selected column from X
  X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE]
  return(X)
}

#' Calculate QR Coefficients and Pseudo R-squared Across Multiple Quantiles
#'
#' Cleans the filtered SNP matrix so the quantile regression design matrix
#' is full rank, fits a full model (SNPs + covariates) and an
#' intercept-only model across all requested quantiles, and computes the
#' pseudo R-squared, 1 - rho(full) / rho(intercept-only), at each quantile
#' using the quantile check loss.
#'
#' @param ExprData List containing X, Y, C, and X.filter
#' @param tau.list Vector of quantiles to be analyzed
#' @param strategy The strategy for removing problematic columns
#'   ("correlation", "variance", or "response_correlation")
#' @return A list containing the cleaned X.filter matrix, the beta matrix
#'   used as TWAS weights, and pseudo R-squared values per quantile
#' @importFrom quantreg rq rq.fit.br
#' @noRd
calculate_qr_and_pseudo_R2 <- function(ExprData, tau.list, strategy = c("correlation", "variance", "response_correlation")) {
  strategy <- match.arg(strategy)

  # Drop SNP columns that make the design matrix rank deficient;
  # a singular design can make rq() fail.
  ExprData$X.filter <- check_remove_highcorr_snp(ExprData$X.filter, ExprData$C, strategy = strategy, response = ExprData$Y)

  # Fit the models for all tau values. Warnings from rq() (e.g. about
  # non-unique solutions) are suppressed deliberately.
  message("Start fitting full model for all taus...")
  fit_full <- suppressWarnings(rq(Y ~ X.filter + C, tau = tau.list, data = ExprData))
  message("Finished fitting full model. Start fitting intercept-only model for all taus...")
  fit_intercept <- suppressWarnings(rq(Y ~ 1, tau = tau.list, data = ExprData))
  message("Finished fitting intercept-only model.")

  # Quantile check loss used in the pseudo R-squared.
  rho <- function(u, tau) {
    u * (tau - (u < 0))
  }

  # Prepare to store the pseudo R-squared results, named by tau
  pseudo_R2 <- numeric(length(tau.list))
  names(pseudo_R2) <- tau.list

  # Pseudo R-squared per tau: 1 - sum(check loss, full) / sum(check loss, intercept)
  for (i in seq_along(tau.list)) {
    tau <- tau.list[i]

    # Residuals for the intercept-only and full models at this tau
    residuals0 <- residuals(fit_intercept, subset = i)
    residuals1 <- residuals(fit_full, subset = i)

    rho0 <- sum(rho(residuals0, tau))
    rho1 <- sum(rho(residuals1, tau))
    pseudo_R2[i] <- 1 - rho1 / rho0
  }

  # Keep only the SNP coefficient rows (drop intercept and covariates)
  # and strip the "X.filter" prefix rq() prepends to coefficient names.
  num_filter_vars <- ncol(ExprData$X.filter)
  beta_mat <- coef(fit_full)[2:(1 + num_filter_vars), , drop = FALSE]
  rownames(beta_mat) <- gsub("^X.filter", "", rownames(beta_mat))

  return(list(X.filter = ExprData$X.filter, beta_mat = beta_mat, pseudo_R2 = pseudo_R2))
}

#' Calculate Heterogeneity of Beta Coefficients Across Quantiles
#'
#' Computes, for each variant, the heterogeneity of its quantile regression
#' coefficients as log(sd(beta) / abs(mean(beta))) across all "coef_qr_"
#' columns. Returns NA when the mean coefficient is exactly zero (to avoid
#' division by zero) or when sd() is NA (e.g. a single quantile column).
#'
#' @param rq_coef_result Data frame containing variant_id and coef_qr_* columns
#' @return A data frame with variant_id and coef_heter values
#' @importFrom stats sd
#' @noRd
calculate_coef_heterogeneity <- function(rq_coef_result) {
  # Quantile regression coefficient columns are named "coef_qr_<tau>"
  coef_cols <- grep("^coef_qr_", colnames(rq_coef_result), value = TRUE)

  heterogeneity_result <- data.frame(
    variant_id = rq_coef_result$variant_id,
    # drop = FALSE keeps a one-column selection as a data frame so that
    # apply() still iterates over rows instead of failing on a bare vector
    coef_heter = apply(rq_coef_result[, coef_cols, drop = FALSE], 1, function(beta) {
      # Mean and standard deviation across quantiles, ignoring NAs
      beta_mean <- mean(beta, na.rm = TRUE)
      beta_sd <- sd(beta, na.rm = TRUE)

      # Avoid division by zero when the mean coefficient is exactly zero
      if (abs(beta_mean) == 0) {
        return(NA_real_)
      }

      log(beta_sd / abs(beta_mean))
    }),
    stringsAsFactors = FALSE
  )

  # Return only variant_id and heterogeneity
  return(heterogeneity_result)
}


#' Quantile TWAS Weight Pipeline
#'
#' @param X Matrix of genotypes
Expand Down Expand Up @@ -462,7 +655,9 @@ quantile_twas_weight_pipeline <- function(X, Y, Z = NULL, maf = NULL, extract_re
quantile_qtl_tau_list = seq(0.05, 0.95, by = 0.05),
quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01)) {
# Step 1: QR screen
message("Starting QR screen for region: ", extract_region_name, " and gene: ", names(fdat$residual_Y)[r])
p.screen <- qr_screen(X = X, Y = Y, Z = Z, tau.list = quantile_qtl_tau_list, threshold = 0.05, method = "qvalue", top_count = 10, top_percent = 15)
message("QR screen completed. Checking for significant SNPs number...")
# Initialize results list
results <- list(qr_screen_pvalue_df = p.screen$df_result)

Expand All @@ -471,6 +666,7 @@ quantile_twas_weight_pipeline <- function(X, Y, Z = NULL, maf = NULL, extract_re
return(results)
}
# Step 2: Filter highly correlated SNPs
message("Filtering highly correlated SNPs...")
if (length(p.screen$sig_SNP_threshold) > 1) {
filtered <- corr_filter(X[, p.screen$sig_SNP_threshold, drop = FALSE], 0.8)
X.filter <- filtered$X.new
Expand All @@ -479,19 +675,29 @@ quantile_twas_weight_pipeline <- function(X, Y, Z = NULL, maf = NULL, extract_re
results$message <- paste0("Only one significant SNP in gene ", extract_region_name, names(fdat$residual_Y)[r], ", skipping correlation filter.")
}
# Step 3: LD clumping and pruning from results of QR_screen
message("Filter highly correlated SNPs completed. Performing LD clumping and pruning from QR screen results...")
LD_SNPs <- multicontext_ld_clumping(X = X[, p.screen$sig_SNP_threshold, drop = FALSE], qr_results = p.screen, maf_list = maf)
x_clumped <- X[, p.screen$sig_SNP_threshold, drop = FALSE][, LD_SNPs$final_SNPs, drop = FALSE]

# Step 4: Only fit marginal QR to get beta with SNPs after LD pruning for quantile_qtl_tau_list values
message("LD clumping and pruning completed. Fitting marginal QR for selected SNPs...")
rq_coef_result <- perform_qr_analysis(X = x_clumped, Y = Y, Z = Z, tau_values = quantile_qtl_tau_list)

# Step 5: Fit QR and get twas weight and R2 for all taus
message("Marginal QR fitting completed. Fitting full QR to calculate TWAS weights and pseudo R-squared values...")
ExprData <- list(X = X, Y = Y, C = Z, X.filter = X.filter)
qr_beta_R2_results <- calculate_qr_and_pseudo_R2(ExprData, quantile_twas_tau_list)
X.filter = qr_beta_R2_results$X.filter

# Step 6: beta_heterogeneity in marginal model
message("TWAS weights and pseudo R-squared calculations completed. Calculating beta heterogeneity...")
beta_heterogeneity <- calculate_coef_heterogeneity(rq_coef_result)
message("Beta heterogeneity calculation completed.")

# Add additional results
results$twas_variant_names <- colnames(X.filter)
results$rq_coef_df <- rq_coef_result
results$beta_heterogeneity <- beta_heterogeneity
results$twas_weight <- qr_beta_R2_results$beta_mat
results$pseudo_R2 <- qr_beta_R2_results$pseudo_R2
results$quantile_twas_prediction <- X.filter %*% results$twas_weight
Expand Down
19 changes: 0 additions & 19 deletions man/calculate_qr_and_pseudo_R2.Rd

This file was deleted.

0 comments on commit 5f0ad7d

Please sign in to comment.