Merge pull request #177 from dianacornejo/master

update vep to handle multiple VCF's

gaow authored Feb 13, 2024
2 parents f5d356d + 0e1038e commit d83dbe1

Showing 3 changed files with 86 additions and 9 deletions.
22 changes: 22 additions & 0 deletions BAT/README
@@ -0,0 +1,22 @@
# Biobank Association Tools (BAT)

## 1. Data processing
### 1.1 Genotype
- VCF quality control [bcftools]
- Principal Component Analysis (PCA) [fastGWA]
- Kinship Analysis [PLINK]
### 1.2 Phenotype
- Phenotype selection for UK Biobank (FIXME: not sure about this because we only have the hearing impairment example)
## 2. Genome-wide association testing (GWAS)
- LMM workflow (Regenie, BOLT-LMM, SAIGE)
## 3. Burden and rare-variant aggregate testing
- Annotation of variants (Annovar & VEP)
- Creation of burden files
- LMM workflow (specifically for rare-variants)
## 4. Fine mapping
- LD clumping (PLINK)
- Region extraction (FIXME: this pipeline needs upgrades)
- SuSiE RSS
## 5. Data visualization
- Hudson plot
- Other ways of visualizing data
2 changes: 1 addition & 1 deletion variant-annotation/annovar-rap.ipynb
@@ -489,7 +489,7 @@
" awk 'NR>1 {if (length ($4) > 1) {print $1,$2,$2 + (length ($4) - 1),$4,$5} else {print $1,$2,$2,$4,$5}}' ${_input} > ${_output}\n",
" else\n",
"        echo \"Input file does not have .bim extension; it will be treated as a summary stats file with columns chrom, genpos, ID, allele0(ref), allele1(alt). Please confirm that your file has this format\"\n",
" awk '{if (length ($4) > 1) {print $1, $2, $2 + (length ($4) - 1), $4, $5} else {print $1, $2, $2, $4, $5}}' ${_input} > ${_output}\n",
" awk 'NR>1 {if (length ($4) > 1) {print $1, $2, $2 + (length ($4) - 1), $4, $5, $3} else {print $1, $2, $2, $4, $5, $3}}' ${_input} > ${_output}\n",
" fi"
]
},
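The awk one-liners above emit annovar-style 1-based inclusive intervals: when the reference allele is longer than one base, the end coordinate is start + len(ref) - 1, while for a SNV start and end coincide. A minimal Python sketch of the same coordinate rule (the `annovar_region` helper is hypothetical, not part of the pipeline):

```python
def annovar_region(chrom, pos, ref, alt):
    """Mirror the awk logic: annovar uses 1-based inclusive intervals,
    so a multi-base REF spans pos .. pos + len(ref) - 1, while a SNV
    occupies a single base (start == end)."""
    if len(ref) > 1:
        return (chrom, pos, pos + len(ref) - 1, ref, alt)
    return (chrom, pos, pos, ref, alt)

# SNV: single-base interval
print(annovar_region("1", 12345, "A", "G"))  # ('1', 12345, 12345, 'A', 'G')
# 3-bp REF allele: the interval covers three bases
print(annovar_region("2", 555, "TTT", "T"))  # ('2', 555, 557, 'TTT', 'T')
```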
71 changes: 63 additions & 8 deletions variant-annotation/vep.ipynb
@@ -112,7 +112,8 @@
" - [gnomAD_exome_index_v2.1.1 / GRCh38](https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi)(1.3M)\n",
" \n",
" - [gnomAD_genome_v2.1.1 / GRCh37/hg19 ](https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz)(461Gb)\n",
" - [gnomAD_genome_index_v2.1.1 / GRCh37/hg19 ](https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi)(2.73M)"
" - [gnomAD_genome_index_v2.1.1 / GRCh37/hg19 ](https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi)(2.73M)\n",
"  - gnomad_v4.0 on Columbia's cluster, located at `data_public/gnomad/gnomad.genomes.v4.0.0.sites.afonly.vcf.gz`"
]
},
{
@@ -132,7 +133,7 @@
"kernel": "SoS"
},
"source": [
"Use `output_vcf` if you want vcf output (default option) or `--no-output-vcf` if you want tsv output.\n",
"Use `--output-tsv` if you want tsv output (the default is vcf).\n",
"\n",
"```\n",
"sos run vep.ipynb \\\n",
@@ -144,11 +145,12 @@
"--cadd_snps data/cadd/whole_genome_SNVs_inclAnno.tsv.gz \\\n",
"--cadd_indels data/cadd/gnomad.genomes.r3.0.indel.tsv.gz \\\n",
"--clinvar_db data/clinvar/clinvar_20231028.vcf.gz \\\n",
"--gnomad data/gnomad/gnomad.genomes.v4.0.0.sites.afonly.vcf.gz \\\n",
"--cache_version 103 \\\n",
"--dir_cache data/vep \\\n",
"--walltime 30h \\\n",
"--mem 30G \\\n",
"--no-output-vcf\n",
"--output-tsv\n",
"```\n"
]
},
@@ -185,7 +187,9 @@
"\n",
"* In this pipeline we use cache version 103 with VEP release 110; if this is not what you want, please adjust the `cache_version` parameter accordingly.\n",
"\n",
"* If you would like to annovate clinvar database add the `clinvar_db` paramater which in Columbia's cluster is `/mnt/vast/hpc/csg/data_public/clinvar/clinvar_20231028.vcf.gz`"
"* If you would like to annotate against the ClinVar database, add the `clinvar_db` parameter, which on Columbia's cluster is `/mnt/vast/hpc/csg/data_public/clinvar/clinvar_20231028.vcf.gz`\n",
"\n",
"* If you want to annotate allele frequencies with the latest gnomAD v4 (genomes) database, it is located on Columbia's cluster at `/mnt/vast/hpc/csg/data_public/gnomad/gnomad.genomes.v4.0.0.sites.afonly.vcf.gz`"
]
},
{
@@ -213,7 +217,7 @@
"# Specific number of threads to use\n",
"parameter: numThreads = 2\n",
"# Input vcf file to annotate\n",
"parameter: vcf = path\n",
"parameter: vcf = paths\n",
"# Human ancestor database\n",
"parameter: human_ancestor = path\n",
"# Conservation file path\n",
@@ -226,6 +230,8 @@
"parameter: cadd_indels = path\n",
"# ClinVar database\n",
"parameter: clinvar_db = path('.')\n",
"# gnomAD database\n",
"parameter: gnomad = path\n",
"# Cache version to use\n",
"parameter: cache_version = int\n",
"# Genome assembly to use\n",
@@ -252,7 +258,7 @@
"outputs": [],
"source": [
"[default_1]\n",
"input: vcf\n",
"input: vcf, group_by=1\n",
"output: f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.tsv.gz' if output_tsv else f'{cwd}/{_input:bn}.VEP.CADD_gnomAD.vcf.gz'\n",
"parameter: vep_window = 10000\n",
"parameter: most_severe = False\n",
@@ -275,10 +281,11 @@
" --dir_plugins $VEP_PLUGIN_DIR \\\n",
" ${('--everything') if everything else ''} \\\n",
" ${('--custom file='+ str(clinvar_db)+',short_name=ClinVar,format=vcf,type=exact,coords=0,fields=CLNSIG%CLNREVSTAT%CLNDN') if clinvar_db else ''} \\\n",
" --custom ${gnomad},gnomADg,vcf,exact,0,AF_joint_afr,AF_joint_amr,AF_joint_asj,AF_joint_eas,AF_joint_sas,AF_joint_fin,AF_joint_nfe \\\n",
" --plugin LoF,human_ancestor_fa:${human_ancestor},loftee_path:$VEP_PLUGIN_DIR,conservation_file:${conservation_file},gerp_bigwig:${gerp_bigwig} \\\n",
" --plugin CADD,snv=${cadd_snps},indels=${cadd_indels}\n",
" bgzip ${_output:n}\n",
" tabix ${_output}"
" ${\"\" if output_tsv else (\"tabix \" + str( _output))}"
]
},
{
@@ -302,6 +309,7 @@
"source": [
"[default_2]\n",
"stop_if(not output_tsv)\n",
"input: group_by=1\n",
"output: f'{cwd}/{_input:bnn}.formatted.tsv.gz'\n",
"task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"python: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
@@ -324,9 +332,56 @@
" header = header_comment.split('#')[1]\n",
" # Split the header into column names\n",
" columns = header.split('\\t')\n",
" # Read dataframe with correct column names\n",
" df = pd.read_csv('${_input}', compression='gzip', comment='#', sep='\\t', names=columns, dtype='string')\n",
" # Split the values in 'Location' and create new columns\n",
" df[['chromosome', 'position']] = df['Location'].str.split(':', expand=True)\n",
" df[['start', 'end']] = df['position'].str.split('-', expand=True)\n",
" # If there are only two values, set the end column equal to the start column\n",
"    df['end'] = df['end'].fillna(df['start'])\n",
" # Drop the intermediate columns\n",
" df.drop(['position'], axis=1, inplace=True)\n",
" # Reorder the columns if necessary\n",
" column_order = ['chromosome', 'start', 'end'] + [col for col in df.columns if col not in ['chromosome', 'start', 'end']]\n",
" df = df[column_order]\n",
" df.to_csv('${_output}', index=False, sep='\\t',header=True, compression={'method': 'gzip', 'compresslevel': 9})"
]
},
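The `default_2` transformation can be illustrated standalone: VEP writes `Location` as `chrom:pos` for SNVs and `chrom:start-end` for multi-base variants, and the step splits it into `chromosome`/`start`/`end`, copying `start` into `end` when no range is present. A self-contained pandas sketch under those assumptions (the two toy rows are invented for illustration):

```python
import io

import pandas as pd

# Toy VEP-style table: one SNV ("chrom:pos") and one ranged variant
raw = io.StringIO(
    "Location\tAllele\n"
    "1:12345\tA\n"
    "2:555-560\tTTTTTT\n"
)
df = pd.read_csv(raw, sep="\t", dtype="string")

# Split Location into chromosome and position, then position into start/end
df[["chromosome", "position"]] = df["Location"].str.split(":", expand=True)
df[["start", "end"]] = df["position"].str.split("-", expand=True)
# SNVs have no "-", so end comes back missing: copy start into end
df["end"] = df["end"].fillna(df["start"])
df = df.drop(columns=["position"])

# Move the coordinate columns to the front
order = ["chromosome", "start", "end"] + [c for c in df.columns if c not in ("chromosome", "start", "end")]
df = df[order]
print(df.to_string(index=False))
```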
{
"cell_type": "markdown",
"id": "bbf41422-947e-4277-ab15-7b349d7ad04d",
"metadata": {
"kernel": "SoS"
},
"source": [
"FIXME: for tsv output the tabix indexing needs to be done differently (explicit column flags); the gz file also needs to be bgzipped first"
]
},
{
"cell_type": "raw",
"id": "76a8cb5f-ad3e-42cc-884e-251edb3094ed",
"metadata": {
"kernel": "SoS"
},
"source": [
"[default_3]\n",
"stop_if(not output_tsv)\n",
"output: f'{cwd}/{_input:bnn}.formatted.tsv.gz'\n",
"task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"bash: container=container, entrypoint=\"micromamba run -a '' -n rare_variation\", expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n",
" bgzip ${_input}\n",
" tabix -s 1 -b 2 -e 3 ${_output}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47f12c92-5c4c-43d8-ab07-3124b2d59fba",
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -352,7 +407,7 @@
""
]
],
"version": "0.24.1"
"version": "0.24.3"
}
},
"nbformat": 4,
