From f9c2588a499dfd3b4eff7871cdb3ec415e511bd4 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Fri, 29 Apr 2022 16:43:52 -0400 Subject: [PATCH] fix path issue --- GWAS/Region_Extraction.ipynb | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/GWAS/Region_Extraction.ipynb b/GWAS/Region_Extraction.ipynb index 9fb3530..578d1c9 100644 --- a/GWAS/Region_Extraction.ipynb +++ b/GWAS/Region_Extraction.ipynb @@ -48,7 +48,7 @@ "Make sure you install the pre-requisited before running this notebook:\n", "\n", "```\n", - "pip install LDtoolsets\n", + "pip install cugg\n", "```" ] }, @@ -71,7 +71,7 @@ "- `--pheno-path`, the path of a phenotype. Only for one genotype data. If `None`, only `pld` will be calculated.\n", " - The phenotype file should have a column with the name `IID`, which is used to represent the sample ID.\n", "- `--sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n", - " - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n", + " - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n", "- `--unrelated-samples`, the file path of unrelated samples with a column named `IID`. If `None`, all samples will be considered unrelative. \n", "- `--cwd`, the path of output directory\n", "\n", @@ -81,7 +81,7 @@ " - The first column is chromosome ID, the 2nd file is genotype for that chromosome.\n", " - When chromosome ID is 0, it implies that the genotype file contains all the genotypes.\n", "- `--imp-sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n", - " - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n", + " - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n", "- `--imp-ref`, the reference genome if exome genotype and imputed genotype are different. If `None`, The two genotype data will be considered from the same " ] }, @@ -155,12 +155,10 @@ "parameter: geno_path = path\n", "# Phenotype path\n", "parameter: pheno_path = path\n", - "# Sample file path, for bgen format\n", - "parameter: bgen_sample_path = path('.')\n", "# Path to summary stats file\n", "parameter: sumstats_path = path\n", - "# Path to summary stats format configuration\n", - "parameter: format_config_path = path('.')\n", + "# Sample file path, for bgen format\n", + "parameter: bgen_sample_path = path()\n", "# Path to samples of unrelated individuals\n", "parameter: unrelated_samples = path()\n", "# imputed Genotype file inventory\n", @@ -170,7 +168,7 @@ "# Number of tasks to run in each job on cluster\n", "parameter: job_size = int\n", "# The reference genome of imputed genotype data\n", - "parameter: imp_ref = str\n", + "parameter: imp_ref = str('')\n", "parameter: walltime = '12h'\n", "parameter: mem = '60G'\n", "fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')\n", @@ -189,18 +187,18 @@ "outputs": [], "source": [ "[default_1 (export utils script)]\n", - "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('LDtools')\n", + "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('cugg')\n", "parameter: scan_window = 500000\n", "output: f'{cwd:a}/utils.py'\n", "report: expand = '${ }', output=f'{cwd:a}/utils.py'\n", " import pandas as pd\n", " import numpy as np\n", " import dask.array as da\n", - " from LDtools.liftover import *\n", - " from LDtools.genodata import *\n", - " from LDtools.sumstat import *\n", - " from LDtools.ldmatrix import *\n", - " from LDtools.utils import *\n", + " from cugg.liftover import *\n", + " from cugg.genodata import *\n", + " from cugg.sumstat import *\n", + " from cugg.ldmatrix import *\n", + " from cugg.utils import *\n", "\n", "\n", " def main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD,bgen_sample_path):\n", @@ -344,16 +342,24 @@ " imp_sumstats_path = ${_input[5]:r}\n", " bgen_sample_path = ${_input[6]:r}\n", " imp_ref = '${imp_ref}'\n", + "\n", + " if not imp_ref:\n", + " imp_ref=None\n", + "\n", + " if not os.path.isfile(bgen_sample_path):\n", + " bgen_sample_path=None\n", + " print('If the genotype data is bgen format, please provide the path of bgen sample')\n", " \n", " input_format_config = ${format_config_path:r} if ${format_config_path.is_file()} else None\n", "\n", " chrom = \"${_regions[0]}\"\n", " # Load genotype file for the region of interest\n", " geno_inventory = dict([x.strip().split() for x in open(input_geno_path).readlines() if x.strip()])\n", - " if yml_path.is_file(): \n", + " if os.path.isfile(imp_geno_path): \n", " imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n", " else:\n", " imp_geno_inventory={'0':None,chrom:None}\n", + " imp_sumstats_path = None\n", " \n", " if chrom.startswith('chr'):\n", " chrom = chrom[3:]\n",