cumc · gaow · Oct 1, 2024 · Sep 30, 2024 · Sep 30, 2024 · Oct 1, 2024
diff --git a/code/data_preprocessing/phenotype/phenotype_imputation.ipynb b/code/data_preprocessing/phenotype/phenotype_imputation.ipynb
@@ -178,7 +178,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
  "id": "9073376b-b7e5-46d6-ae70-54229fa72453",
  "metadata": {
  "kernel": "SoS"
@@ -211,6 +211,8 @@
  " Work directory & output directory\n",
  " --phenoFile VAL (as path, required)\n",
  " Molecular phenotype matrix\n",
+ " --[no-]qc-prior-to-impute (default to True)\n",
+ " QC before imputation to remove unqualified features\n",
  " --job-size 1 (as int)\n",
  " For cluster jobs, number commands to run per job\n",
  " --walltime 72h\n",
@@ -285,6 +287,8 @@
  "parameter: cwd = path(\"output\")\n",
  "# Molecular phenotype matrix\n",
  "parameter: phenoFile = path\n",
+ "# QC before imputation to remove unqualified features \n",
+ "parameter: qc_prior_to_impute = True\n",
  "# For cluster jobs, number commands to run per job\n",
  "parameter: job_size = 1\n",
  "# Wall clock time expected\n",
@@ -312,7 +316,6 @@
  "parameter: prior = \"ebnm_point_laplace\"\n",
  "parameter: varType = '1'\n",
  "parameter: num_factor = '60'\n",
- "parameter: qc_prior_to_impute = True\n",
  "input: phenoFile\n",
  "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
  "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads \n",
@@ -327,8 +330,8 @@
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
  " # Get index of features that have > 40% missing \n",
  " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
- " # Get index of features where all values are zero.\n",
- " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)]))) == (ncol(pheno) -4))\n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
  " miss.ind <- c(miss.ind, value0.ind)\n",
  " # remove these rows if qc_prior_to_impute = True\n",
  " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
@@ -417,7 +420,22 @@
  " library(\"softImpute\")\n",
  " \n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
  " pheno_range <- range(unlist(as.list(as.data.frame(pheno_NAs))), na.rm = TRUE) \n",
  " if (pheno_range[1] >= 0 && pheno_range[2] <= 1) {\n",
  " dat <- qnorm(as.matrix(pheno_NAs))\n",
@@ -452,7 +470,7 @@
  " pheno_imp <- x_imp\n",
  " }\n",
  " \n",
- " write_delim(cbind(pheno[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
+ " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -488,11 +506,28 @@
  " library(\"tibble\")\n",
  " library(\"readr\")\n",
  " library(\"dplyr\")\n",
- " \n",
+ " library(\"missForest\")\n",
+ "\n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
- " pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'variables')$ximp\n",
- " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
+ " pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'no')$ximp\n",
+ " # If ’variables’ the data is split into pieces of the size equal to the number of cores registered in the parallel backend.\n",
+ " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -524,15 +559,31 @@
  "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
  "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}' \n",
  "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
- " source('/mnt/vast/hpc/csg/zq2209/xgb_imp.R')\n",
+ " source('./xgb_imp.R')\n",
  " library(\"tibble\")\n",
  " library(\"readr\")\n",
  " library(\"dplyr\")\n",
- " \n",
+ " library(\"missForest\")\n",
+ "\n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
  " pheno_imp <- xgboost_imputation(as.matrix(pheno_NAs))\n",
- " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+ " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -569,9 +620,26 @@
  " library(\"dplyr\")\n",
  " \n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
+ " pheno_NAs = pheno_NAs %>% select(where(~mean(is.na(.)) < 0.8))\n",
+ " # I removed columns that have more than 80 % missing values, cuz knn cannot impute them. \n",
  " pheno_imp <- impute.knn(as.matrix(pheno_NAs), rowmax = 1)$data\n",
- " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+ " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -608,12 +676,27 @@
  " library(\"dplyr\")\n",
  " \n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
  " X_mis_C <- as(as.matrix(pheno_NAs), \"Incomplete\")\n",
  " ###uses \"svd\" algorithm\n",
  " fit <- softImpute(X_mis_C,rank = 50,lambda = 30,type = \"svd\")\n",
  " pheno_imp <- complete(as.matrix(pheno_NAs),fit)\n",
- " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+ " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -649,12 +732,27 @@
  " library(\"dplyr\")\n",
  " \n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
  " Yfill <- as.matrix(pheno_NAs)\n",
  " for (t.row in 1:nrow(pheno_NAs)) {\n",
  " Yfill[t.row, is.na(Yfill[t.row,])] <- rowMeans(Yfill, na.rm = TRUE)[t.row] \n",
  " } \n",
- " write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
+ " write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -690,12 +788,27 @@
  " library(\"dplyr\")\n",
  " \n",
  " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
- " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " # Get index of features that have > 40% missing \n",
+ " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+ " # Get index of features where more than 95% values are zero.\n",
+ " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+ " miss.ind <- c(miss.ind, value0.ind)\n",
+ " # remove these rows if qc_prior_to_impute = True\n",
+ " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+ " if (length(miss.ind) > 0) {\n",
+ " pheno_qc <- pheno[-miss.ind, ]\n",
+ " } else {\n",
+ " pheno_qc <- pheno\n",
+ " }\n",
+ " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+ " } else {\n",
+ " pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+ " }\n",
  " Yfill <- as.matrix(pheno_NAs)\n",
  " for (t.row in 1:nrow(pheno_NAs)) {\n",
  " Yfill[t.row, is.na(Yfill[t.row,])] <- min(Yfill[t.row, ], na.rm = TRUE)\n",
  " }\n",
- " write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
+ " write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
  "\n",
  "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " bgzip -f ${_output:n}\n",
@@ -732,7 +845,7 @@
  "# with missing rate smaller than tol_missing will be mean_imputed. Say if we want to keep rows with less than 5% missing, then we use 0.05 as tol_missing.\n",
  "parameter: tol_missing = 0.05\n",
  "input: phenoFile\n",
- "output: f'{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
+ "output: f'{cwd:a}/{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
  "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n",
  "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
  " library(\"dplyr\")\n",