Skip to content

Commit

Permalink
gEBMF not resolved
Browse files Browse the repository at this point in the history
  • Loading branch information
rl3328 committed Sep 30, 2024
1 parent 471b02e commit e200dab
Show file tree
Hide file tree
Showing 6 changed files with 319 additions and 70 deletions.

This file was deleted.

Empty file.

This file was deleted.

This file was deleted.

159 changes: 136 additions & 23 deletions code/data_preprocessing/phenotype/phenotype_imputation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "9073376b-b7e5-46d6-ae70-54229fa72453",
"metadata": {
"kernel": "SoS"
Expand Down Expand Up @@ -211,6 +211,8 @@
" Work directory & output directory\n",
" --phenoFile VAL (as path, required)\n",
" Molecular phenotype matrix\n",
" --[no-]qc-prior-to-impute (default to True)\n",
" QC before imputation to remove unqualified features\n",
" --job-size 1 (as int)\n",
" For cluster jobs, number commands to run per job\n",
" --walltime 72h\n",
Expand Down Expand Up @@ -285,6 +287,8 @@
"parameter: cwd = path(\"output\")\n",
"# Molecular phenotype matrix\n",
"parameter: phenoFile = path\n",
"# QC before imputation to remove unqualified features \n",
"parameter: qc_prior_to_impute = True\n",
"# For cluster jobs, number commands to run per job\n",
"parameter: job_size = 1\n",
"# Wall clock time expected\n",
Expand Down Expand Up @@ -312,7 +316,6 @@
"parameter: prior = \"ebnm_point_laplace\"\n",
"parameter: varType = '1'\n",
"parameter: num_factor = '60'\n",
"parameter: qc_prior_to_impute = True\n",
"input: phenoFile\n",
"output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads \n",
Expand All @@ -327,8 +330,8 @@
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where all values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)]))) == (ncol(pheno) -4))\n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
Expand Down Expand Up @@ -417,7 +420,22 @@
" library(\"softImpute\")\n",
" \n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" pheno_range <- range(unlist(as.list(as.data.frame(pheno_NAs))), na.rm = TRUE) \n",
" if (pheno_range[1] >= 0 && pheno_range[2] <= 1) {\n",
" dat <- qnorm(as.matrix(pheno_NAs))\n",
Expand Down Expand Up @@ -452,7 +470,7 @@
" pheno_imp <- x_imp\n",
" }\n",
" \n",
" write_delim(cbind(pheno[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
" write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -488,11 +506,28 @@
" library(\"tibble\")\n",
" library(\"readr\")\n",
" library(\"dplyr\")\n",
" \n",
" library(\"missForest\")\n",
"\n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'variables')$ximp\n",
" write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'no')$ximp\n",
" # If ’variables’ the data is split into pieces of the size equal to the number of cores registered in the parallel backend.\n",
" write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -524,15 +559,31 @@
"output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}' \n",
"R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
" source('/mnt/vast/hpc/csg/zq2209/xgb_imp.R')\n",
" source('./xgb_imp.R')\n",
" library(\"tibble\")\n",
" library(\"readr\")\n",
" library(\"dplyr\")\n",
" \n",
" library(\"missForest\")\n",
"\n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" pheno_imp <- xgboost_imputation(as.matrix(pheno_NAs))\n",
" write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
" write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -569,9 +620,26 @@
" library(\"dplyr\")\n",
" \n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" pheno_NAs = pheno_NAs %>% select(where(~mean(is.na(.)) < 0.8))\n",
" # I removed columns that have more than 80 % missing values, cuz knn cannot impute them. \n",
" pheno_imp <- impute.knn(as.matrix(pheno_NAs), rowmax = 1)$data\n",
" write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
" write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -608,12 +676,27 @@
" library(\"dplyr\")\n",
" \n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" X_mis_C <- as(as.matrix(pheno_NAs), \"Incomplete\")\n",
" ###uses \"svd\" algorithm\n",
" fit <- softImpute(X_mis_C,rank = 50,lambda = 30,type = \"svd\")\n",
" pheno_imp <- complete(as.matrix(pheno_NAs),fit)\n",
" write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
" write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -649,12 +732,27 @@
" library(\"dplyr\")\n",
" \n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" Yfill <- as.matrix(pheno_NAs)\n",
" for (t.row in 1:nrow(pheno_NAs)) {\n",
" Yfill[t.row, is.na(Yfill[t.row,])] <- rowMeans(Yfill, na.rm = TRUE)[t.row] \n",
" } \n",
" write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
" write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -690,12 +788,27 @@
" library(\"dplyr\")\n",
" \n",
" pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" # Get index of features that have > 40% missing \n",
" miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
" # Get index of features where more than 95% values are zero.\n",
" value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
" miss.ind <- c(miss.ind, value0.ind)\n",
" # remove these rows if qc_prior_to_impute = True\n",
" if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
" if (length(miss.ind) > 0) {\n",
" pheno_qc <- pheno[-miss.ind, ]\n",
" } else {\n",
" pheno_qc <- pheno\n",
" }\n",
" pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
" } else {\n",
" pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
" }\n",
" Yfill <- as.matrix(pheno_NAs)\n",
" for (t.row in 1:nrow(pheno_NAs)) {\n",
" Yfill[t.row, is.na(Yfill[t.row,])] <- min(Yfill[t.row, ], na.rm = TRUE)\n",
" }\n",
" write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
" write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
"\n",
"bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" bgzip -f ${_output:n}\n",
Expand Down Expand Up @@ -732,7 +845,7 @@
"# with missing rate smaller than tol_missing will be mean_imputed. Say if we want to keep rows with less than 5% missing, then we use 0.05 as tol_missing.\n",
"parameter: tol_missing = 0.05\n",
"input: phenoFile\n",
"output: f'{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
"output: f'{cwd:a}/{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n",
"R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
" library(\"dplyr\")\n",
Expand Down
Loading

0 comments on commit e200dab

Please sign in to comment.