Commit

Merge pull request #174 from dianacornejo/master
Nice updates
gaow authored Nov 1, 2023
2 parents eedd7a6 + 1dad058 commit ecbcc26
Showing 1 changed file with 32 additions and 14 deletions.
46 changes: 32 additions & 14 deletions variant-annotation/vep.ipynb
@@ -7,7 +7,7 @@
"kernel": "SoS"
},
"source": [
"# VEP annotation of variants in VCF format using plugins"
"# VEP annotation of rare variants using plugins"
]
},
{
@@ -132,6 +132,8 @@
"kernel": "SoS"
},
"source": [
"Use `output_vcf` if you want vcf output (default option) or `--no-output-vcf` if you want tsv output.\n",
"\n",
"```\n",
"sos run vep.ipynb \\\n",
"--cwd output \\\n",
@@ -140,10 +142,13 @@
"--conservation_file data/vep/loftee.sql \\\n",
"--gerp_bigwig data/vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw \\\n",
"--cadd_snps data/cadd/whole_genome_SNVs_inclAnno.tsv.gz \\\n",
"--cad_indels data/cadd/gnomad.genomes.r3.0.indel.tsv.gz \\\n",
"--cadd_indels data/cadd/gnomad.genomes.r3.0.indel.tsv.gz \\\n",
"--clinvar_db data/clinvar/clinvar_20231028.vcf.gz \\\n",
"--cache_version 103 \\\n",
"--dir_cache data/vep \\\n",
"--walltime 30h \\\n",
"--mem 30G\n",
"--mem 30G \\\n",
"--no-output-vcf\n",
"```\n"
]
},
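To make the toggle documented in the cell above concrete, here is a small standalone Python sketch (not part of the workflow; the basename `sample` and the `output` directory are hypothetical, chosen to mirror the example) of how `output_vcf` selects the `vep` output flag and the result filename in `default_1`:

```
# Hedged illustration only: mirrors the conditional used in the default_1 step.
for output_vcf in (True, False):
    # default_1 passes --vcf to vep when output_vcf is True, --tab otherwise
    vep_flag = "--vcf" if output_vcf else "--tab"
    # ...and names the result {cwd}/{input basename}.VEP.CADD_gnomAD.{vcf,tsv}.gz accordingly
    ext = "vcf.gz" if output_vcf else "tsv.gz"
    print(f"output_vcf={output_vcf}: vep {vep_flag} -> output/sample.VEP.CADD_gnomAD.{ext}")
```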
@@ -156,7 +161,7 @@
"source": [
"## Output file\n",
"\n",
"The output file will be formatted as VCF"
"The output file will be formatted as VCF or TSV depending on the option you choose"
]
},
{
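For a quick look at the annotated result described in the "Output file" cell above, a minimal Python sketch (again not part of the workflow; it assumes a VCF-mode run with `--cwd output` on an input named `sample.vcf.gz`, and relies on the bgzip output being a valid gzip stream):

```
import gzip

# Print the column header line and the first few annotated records,
# skipping the ## metadata block (which carries the VEP CSQ field description).
with gzip.open("output/sample.VEP.CADD_gnomAD.vcf.gz", "rt") as fh:
    shown = 0
    for line in fh:
        if line.startswith("##"):
            continue
        print(line.rstrip())
        shown += 1
        if shown >= 5:
            break
```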
@@ -227,6 +232,8 @@
"parameter: assembly = 'GRCh38'\n",
"# Cache dir\n",
"parameter: dir_cache = path\n",
"# Generate vcf output\n",
"parameter: output_vcf = True\n",
"# For cluster jobs, number commands to run per job\n",
"parameter: job_size = 1\n",
"parameter: mem = '15G'\n",
@@ -246,14 +253,14 @@
"source": [
"[default_1]\n",
"input: vcf\n",
"output: f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.tsv.gz' \n",
"output: f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.vcf.gz' if output_vcf else f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.tsv.gz'\n",
"parameter: vep_window = 10000\n",
"parameter: most_severe = False\n",
"parameter: everything = True\n",
"bash: container=container, entrypoint=\"micromamba run -a '' -n rare_variation\", expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'\n",
"bash: container=container, entrypoint=\"micromamba run -a '' -n rare_variation\", expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
" vep \\\n",
" --verbose \\\n",
" --tab \\\n",
" ${('--vcf') if output_vcf else '--tab'} \\\n",
" -i ${_input} \\\n",
" -o ${_output:n} \\\n",
" --distance ${vep_window} \\\n",
@@ -269,7 +276,7 @@
" ${('--custom file='+ str(clinvar_db)+',short_name=ClinVar,format=vcf,type=exact,coords=0,fields=CLNSIG%CLNREVSTAT%CLNDN') if clinvar_db else ''} \\\n",
" --plugin LoF,human_ancestor_fa:${human_ancestor},loftee_path:$VEP_PLUGIN_DIR,conservation_file:${conservation_file},gerp_bigwig:${gerp_bigwig} \\\n",
" --plugin CADD,snv=${cadd_snps},indels=${cadd_indels}\n",
" bgzip ${_output:n}\n",
" bgzip ${_output:n}\n",
" tabix ${_output}"
]
},
@@ -293,15 +300,16 @@
"outputs": [],
"source": [
"[default_2]\n",
"stop_if(output_vcf)\n",
"output: f'{cwd}/{_input:bnn}.formatted.tsv.gz'\n",
"python: expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'\n",
" pandas as pd\n",
"python: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
" import pandas as pd\n",
" import gzip\n",
" # Initialize an empty list to store the comments\n",
" comments = []\n",
"\n",
" # Open the gzip-compressed VCF file and read it line by line\n",
" with gzip.open(${_input}, 'rt') as file:\n",
" with gzip.open('${_input}', 'rt') as file:\n",
" for line in file:\n",
" if line.startswith('#'):\n",
" comments.append(line.strip())\n",
@@ -314,9 +322,19 @@
" header = header_comment.split('#')[1]\n",
" # Split the header into column names\n",
" columns = header.split('\\t')\n",
" df = pd.read_csv(${_input}, compression='gzip', comment='#', sep='\\t', names=columns)\n",
" df.to_csv(${_output}, index=False, sep='\\t', header=True, compression={'method': 'gzip', 'compresslevel': 9})"
" df = pd.read_csv('${_input}', compression='gzip', comment='#', sep='\\t', names=columns)\n",
" df.to_csv('${_output}', index=False, sep='\\t',header=True, compression={'method': 'gzip', 'compresslevel': 9})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47f12c92-5c4c-43d8-ab07-3124b2d59fba",
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -342,7 +360,7 @@
""
]
],
"version": "0.20.1"
"version": "0.24.3"
}
},
"nbformat": 4,
