Commit

Merge pull request #174 from dianacornejo/master
Nice updates
gaow authored Nov 1, 2023
2 parents eedd7a6 + 1dad058 commit ecbcc26
Showing 1 changed file with 32 additions and 14 deletions.
46 changes: 32 additions & 14 deletions variant-annotation/vep.ipynb
@@ -7,7 +7,7 @@
"kernel": "SoS"
},
"source": [
"# VEP annotation of variants in VCF format using plugins"
"# VEP annotation of rare variants using plugins"
]
},
{
@@ -132,6 +132,8 @@
"kernel": "SoS"
},
"source": [
"Use `output_vcf` if you want vcf output (default option) or `--no-output-vcf` if you want tsv output.\n",
"\n",
"```\n",
"sos run vep.ipynb \\\n",
"--cwd output \\\n",
@@ -140,10 +142,13 @@
"--conservation_file data/vep/loftee.sql \\\n",
"--gerp_bigwig data/vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw \\\n",
"--cadd_snps data/cadd/whole_genome_SNVs_inclAnno.tsv.gz \\\n",
"--cad_indels data/cadd/gnomad.genomes.r3.0.indel.tsv.gz \\\n",
"--cadd_indels data/cadd/gnomad.genomes.r3.0.indel.tsv.gz \\\n",
"--clinvar_db data/clinvar/clinvar_20231028.vcf.gz \\\n",
"--cache_version 103 \\\n",
"--dir_cache data/vep \\\n",
"--walltime 30h \\\n",
"--mem 30G\n",
"--mem 30G \\\n",
"--no-output-vcf\n",
"```\n"
]
},
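To make the toggle documented in the cell above concrete, here is a small standalone Python sketch (not part of the workflow; the basename `sample` and the `output` directory are hypothetical, chosen to mirror the example) of how `output_vcf` selects the `vep` output flag and the result filename in `default_1`:

```
# Hedged illustration only: mirrors the conditional used in the default_1 step.
for output_vcf in (True, False):
    # default_1 passes --vcf to vep when output_vcf is True, --tab otherwise
    vep_flag = "--vcf" if output_vcf else "--tab"
    # ...and names the result {cwd}/{input basename}.VEP.CADD_gnomAD.{vcf,tsv}.gz accordingly
    ext = "vcf.gz" if output_vcf else "tsv.gz"
    print(f"output_vcf={output_vcf}: vep {vep_flag} -> output/sample.VEP.CADD_gnomAD.{ext}")
```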
@@ -156,7 +161,7 @@
"source": [
"## Output file\n",
"\n",
"The output file will be formatted as VCF"
"The output file will be formatted as VCF or TSV depending on the option you choose"
]
},
{
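For a quick look at the annotated result described in the "Output file" cell above, a minimal Python sketch (again not part of the workflow; it assumes a VCF-mode run with `--cwd output` on an input named `sample.vcf.gz`, and relies on the bgzip output being a valid gzip stream):

```
import gzip

# Print the column header line and the first few annotated records,
# skipping the ## metadata block (which carries the VEP CSQ field description).
with gzip.open("output/sample.VEP.CADD_gnomAD.vcf.gz", "rt") as fh:
    shown = 0
    for line in fh:
        if line.startswith("##"):
            continue
        print(line.rstrip())
        shown += 1
        if shown >= 5:
            break
```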
@@ -227,6 +232,8 @@
"parameter: assembly = 'GRCh38'\n",
"# Cache dir\n",
"parameter: dir_cache = path\n",
"# Generate vcf output\n",
"parameter: output_vcf = True\n",
"# For cluster jobs, number commands to run per job\n",
"parameter: job_size = 1\n",
"parameter: mem = '15G'\n",
@@ -246,14 +253,14 @@
"source": [
"[default_1]\n",
"input: vcf\n",
"output: f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.tsv.gz' \n",
"output: f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.vcf.gz' if output_vcf else f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.tsv.gz'\n",
"parameter: vep_window = 10000\n",
"parameter: most_severe = False\n",
"parameter: everything = True\n",
"bash: container=container, entrypoint=\"micromamba run -a '' -n rare_variation\", expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'\n",
"bash: container=container, entrypoint=\"micromamba run -a '' -n rare_variation\", expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
" vep \\\n",
" --verbose \\\n",
" --tab \\\n",
" ${('--vcf') if output_vcf else '--tab'} \\\n",
" -i ${_input} \\\n",
" -o ${_output:n} \\\n",
" --distance ${vep_window} \\\n",
@@ -269,7 +276,7 @@
" ${('--custom file='+ str(clinvar_db)+',short_name=ClinVar,format=vcf,type=exact,coords=0,fields=CLNSIG%CLNREVSTAT%CLNDN') if clinvar_db else ''} \\\n",
" --plugin LoF,human_ancestor_fa:${human_ancestor},loftee_path:$VEP_PLUGIN_DIR,conservation_file:${conservation_file},gerp_bigwig:${gerp_bigwig} \\\n",
" --plugin CADD,snv=${cadd_snps},indels=${cadd_indels}\n",
" bgzip ${_output:n}\n",
" bgzip ${_output:n}\n",
" tabix ${_output}"
]
},
@@ -293,15 +300,16 @@
"outputs": [],
"source": [
"[default_2]\n",
"stop_if(output_vcf)\n",
"output: f'{cwd}/{_input:bnn}.formatted.tsv.gz'\n",
"python: expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'\n",
" pandas as pd\n",
"python: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
" import pandas as pd\n",
" import gzip\n",
" # Initialize an empty list to store the comments\n",
" comments = []\n",
"\n",
" # Open the gzip-compressed VCF file and read it line by line\n",
" with gzip.open(${_input}, 'rt') as file:\n",
" with gzip.open('${_input}', 'rt') as file:\n",
" for line in file:\n",
" if line.startswith('#'):\n",
" comments.append(line.strip())\n",
@@ -314,9 +322,19 @@
" header = header_comment.split('#')[1]\n",
" # Split the header into column names\n",
" columns = header.split('\\t')\n",
" df = pd.read_csv(${_input}, compression='gzip', comment='#', sep='\\t', names=columns)\n",
" df.to_csv(${_output}, index=False, sep='\\t', header=True, compression={'method': 'gzip', 'compresslevel': 9})"
" df = pd.read_csv('${_input}', compression='gzip', comment='#', sep='\\t', names=columns)\n",
" df.to_csv('${_output}', index=False, sep='\\t',header=True, compression={'method': 'gzip', 'compresslevel': 9})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47f12c92-5c4c-43d8-ab07-3124b2d59fba",
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -342,7 +360,7 @@
""
]
],
"version": "0.20.1"
"version": "0.24.3"
}
},
"nbformat": 4,
