Merge pull request #12 from hoelzer/diamond-switch

finalize blast-diamond switch
hoelzer · Oct 7, 2023 · bb97979 · bb97979
2 parents 3e3d673 + 5f3efc2
commit bb97979
Show file tree

Hide file tree

Showing 12 changed files with 49 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -1,15 +1,17 @@
 # Calculation of the Percentage Of Conserved Proteins
 
-![](https://img.shields.io/badge/nextflow-20.10.0-brightgreen)
+![](https://img.shields.io/badge/nextflow->=20.01.0-brightgreen)
 ![](https://img.shields.io/badge/uses-ruby-red)
-![](https://img.shields.io/badge/can_use-conda-yellow.svg)
+![](https://img.shields.io/badge/can_use-conda/mamba-yellow.svg)
 ![](https://img.shields.io/badge/can_use-docker-blue.svg)
 ![](https://img.shields.io/badge/can_use-singularity-orange.svg)
 ![](https://img.shields.io/badge/licence-GLP3-lightgrey.svg)
 
 [![Twitter Follow](https://img.shields.io/twitter/follow/martinhoelzer.svg?style=social)](https://twitter.com/martinhoelzer) 
 
-__Update 2023: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__
+__Update 2023/05: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__
+
+__Update 2023/10: Now using [Diamond](https://www.nature.com/articles/s41592-021-01101-x) instead of Blast for protein alignments. Thx [@michoug](https://github.com/michoug) for the Pull Request.__
 
 As input use one amino acid sequence FASTA file per genome such as provided by
 [Prokka](https://github.com/tseemann/prokka) or genome FASTA files which will be then annotated via [Prokka](https://github.com/tseemann/prokka). 
@@ -26,26 +28,25 @@ nextflow pull hoelzer/pocp
 # check availble release versions and development branches
 nextflow info hoelzer/pocp 
 
-# get the help page and define a release version
-nextflow run hoelzer/pocp -r 2.0.0 --help
+# get the help page and define a release version. ATTENTION: use latest version. 
+nextflow run hoelzer/pocp -r 2.1.0 --help
 
 # example with genome files as input, performing a local execution and using Docker
-nextflow run hoelzer/pocp -r 2.0.0 --genomes 'example/*.fasta' -profile local,docker
+nextflow run hoelzer/pocp -r 2.1.0 --genomes 'example/*.fasta' -profile local,docker
 
 # example with protein FASTA files as input (e.g. from Prokka pre-calculated), performing a SLURM execution and using conda
-nextflow run hoelzer/pocp -r 2.0.0 --proteins 'example/*.faa' -profile slurm,conda
+nextflow run hoelzer/pocp -r 2.1.0 --proteins 'example/*.faa' -profile slurm,conda
 ```
 
-The final output (`pocp-matrix.tsv`) should look like this (here, the resulting `pocp-matrix.tsv` was imported into LibreOffice and formated):
+The final output (`pocp-matrix.tsv`) should look like this (here, the resulting TSV was imported into Numbers on MacOS):
 
 ![Example output](example_output.png)
 
-If needed, the following parameters used for filtering the `blast` results can be
+If needed, the following parameters used for filtering the `diamond` results (blastp mode) can be
 adjusted:
 
 ```bash
 --evalue 1e-5
 --seqidentity 0.4
 --alnlength 0.5
 ```
-
diff --git a/configs/conda.config b/configs/conda.config
@@ -1,6 +1,6 @@
 process {
     withLabel:	prokka	{ conda = "$baseDir/envs/prokka.yaml" } 
-    withLabel:	blast	{ conda = "$baseDir/envs/blast.yaml" } 
-    withLabel:	pocp	{ conda = "$baseDir/envs/blast.yaml" } 
+    withLabel:	diamond	{ conda = "$baseDir/envs/diamond.yaml" } 
+    withLabel:	pocp	{ conda = "$baseDir/envs/diamond.yaml" } 
     withLabel:	ruby	{ conda = "$baseDir/envs/ruby.yaml" } 
 }
diff --git a/configs/container.config b/configs/container.config
@@ -1,6 +1,6 @@
 process {
     withLabel: prokka            { container = 'nanozoo/prokka:1.14.6--773a90d' }
-    withLabel: blast             { container = 'nanozoo/blast:2.9.0--ded80ad' }
-    withLabel: pocp              { container = 'nanozoo/blast:2.9.0--ded80ad' } 
+    withLabel: diamond           { container = 'nanozoo/diamond:2.1.8--b6f1f11' }
+    withLabel: pocp              { container = 'nanozoo/diamond:2.1.8--b6f1f11' } 
     withLabel: ruby              { container = 'nanozoo/ruby:3.2.2--fdadcb3' } 
 }
diff --git a/configs/local.config b/configs/local.config
@@ -1,6 +1,6 @@
 process {
-    withLabel: prokka { cpus = params.cores; memory = params.memory } 
-    withLabel: blast { cpus = params.cores; memory = params.memory } 
-    withLabel: pocp { cpus = 1; memory = '1 GB' } 
-    withLabel: ruby { cpus = 1; memory = '1 GB' } 
+    withLabel: prokka   { cpus = params.cores; memory = params.memory } 
+    withLabel: diamond  { cpus = params.cores; memory = params.memory } 
+    withLabel: pocp     { cpus = 1; memory = '1 GB' } 
+    withLabel: ruby     { cpus = 1; memory = '1 GB' } 
 }
diff --git a/configs/nodes.config b/configs/nodes.config
@@ -1,6 +1,6 @@
 process {   
-    withLabel: prokka { cpus = 8 ; memory = '2 GB' }
-    withLabel: blast { cpus = 8 ; memory = '2 GB' }
-    withLabel: pocp { cpus = 1 ; memory = '1 GB' }
-    withLabel: ruby { cpus = 1 ; memory = '1 GB' }
+    withLabel: prokka   { cpus = 8 ; memory = '2 GB' }
+    withLabel: diamond  { cpus = 8 ; memory = '2 GB' }
+    withLabel: pocp     { cpus = 1 ; memory = '1 GB' }
+    withLabel: ruby     { cpus = 1 ; memory = '1 GB' }
 }
diff --git a/envs/blast.yaml b/envs/blast.yaml
diff --git a/envs/diamond.yaml b/envs/diamond.yaml
@@ -0,0 +1,7 @@
+name: diamond
+channels:
+  - bioconda
+dependencies:
+  - diamond=2.1.8
+  - python=3.11.4
+
diff --git a/example_output.png b/example_output.png
diff --git a/main.nf b/main.nf
@@ -44,7 +44,7 @@ if (params.proteins && params.list) { proteins_input_ch = Channel
 
 // load modules
 include {prokka} from './modules/prokka'
-include {blast} from './modules/blast'
+include {diamond} from './modules/diamond'
 include {pocp; pocp_matrix} from './modules/pocp'
 
 // main workflow
@@ -60,10 +60,10 @@ workflow {
             return tuple( id1, faa1, id2, faa2 )
     }
 
-    blast_hits_ch = blast(allvsall_ch).hits.groupTuple()
+    diamond_hits_ch = diamond(allvsall_ch).hits.groupTuple()
 
     pocp_matrix(
-      pocp(blast_hits_ch).map {comparison, pocp_file, pocp_value -> [pocp_file]}.collect()
+      pocp(diamond_hits_ch).map {comparison, pocp_file, pocp_value -> [pocp_file]}.collect()
     )
 }
 
@@ -94,9 +94,9 @@ def helpMSG() {
 
     ${c_yellow}Options:${c_reset}
     --gcode             genetic code for Prokka annotation [default: $params.gcode]
-    --evalue            Evalue for blastp [default: $params.evalue]
-    --seqidentity       Sequence identity for blastp [default: $params.seqidentity]
-    --alnlength         Alignment length for blastp [default: $params.alnlength]
+    --evalue            Evalue for diamond protein search [default: $params.evalue]
+    --seqidentity       Sequence identity for diamond alignments [default: $params.seqidentity]
+    --alnlength         Alignment length for diamond hits [default: $params.alnlength]
     --cores             max cores per process for local use [default: $params.cores]
     --max_cores         max cores (in total) for local use [default: $params.max_cores]
     --memory            max memory for local use [default: $params.memory]

diff --git a/modules/blast.nf → modules/diamond.nf b/modules/blast.nf → modules/diamond.nf
@@ -1,31 +1,31 @@
 /*
-Run blastp and format output for downstream POCP calculations.
+Run diamond and format output for downstream POCP calculations.
+https://www.nature.com/articles/s41592-021-01101-x
 */
-process blast {
-    label 'blast'
-    publishDir "${params.output}/blast", mode: 'copy', pattern: "${name}-query-${name2}-db.blast*"
+process diamond {
+    label 'diamond'
+    publishDir "${params.output}/diamond", mode: 'copy', pattern: "${name}-query-${name2}-db.diamond*"
 
     input:
       tuple val(name), path(fasta), val(name2), path(fasta2) 
 
     output:
-	    tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.blast"), emit: blast 
-	    tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.blast.hits"), emit: hits
+	    tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.diamond"), emit: diamond 
+	    tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.diamond.hits"), emit: hits
 
     script:
     """
     diamond makedb --in ${fasta2} -d ${fasta2}.dmnd 
-    diamond blastp --ultra-sensitive -p ${task.cpus} -q ${fasta} -d ${fasta2}.dmnd -e ${params.evalue} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend qlen sstart send evalue bitscore slen | awk '{if(\$3>${params.seqidentity*100} && \$4>(\$9*${params.alnlength})){print \$0}}' > ${name}-query-${name2}-db.blast
-    awk '{print \$1}' ${name}-query-${name2}-db.blast | sort | uniq | wc -l | awk '{print \$1}' > ${name}-query-${name2}-db.blast.hits
-    echo "\t${name}:\tFound \$(cat ${name}-query-${name2}-db.blast.hits) matches with an E value of less than ${params.evalue}, a sequence identity of more than ${params.seqidentity*100}%, and an alignable region of the query protein sequence of more than ${params.alnlength*100}%."
+    diamond blastp --ultra-sensitive -p ${task.cpus} -q ${fasta} -d ${fasta2}.dmnd -e ${params.evalue} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend qlen sstart send evalue bitscore slen | awk '{if(\$3>${params.seqidentity*100} && \$4>(\$9*${params.alnlength})){print \$0}}' > ${name}-query-${name2}-db.diamond
+    awk '{print \$1}' ${name}-query-${name2}-db.diamond | sort | uniq | wc -l | awk '{print \$1}' > ${name}-query-${name2}-db.diamond.hits
+    echo "\t${name}:\tFound \$(cat ${name}-query-${name2}-db.diamond.hits) matches with an E value of less than ${params.evalue}, a sequence identity of more than ${params.seqidentity*100}%, and an alignable region of the query protein sequence of more than ${params.alnlength*100}%."
 
     genome_ids_sorted='${name} ${name2}'
     genome_ids_sorted=\$(echo \$genome_ids_sorted | xargs -n1 | sort | xargs | sed 's/ /-vs-/g')
-
-    #ruby pocp.rb ${fasta} ${fasta2} ${params.evalue} ${params.seqidentity} ${params.alnlength} ./ ${task.cpus}
     """
 }
 
 /* Comments:
+When blast was used:
 I removed the -parse_seqids parameter from the makeblastdb command because of an error with fasta IDs that are longer than 50 chars. strange.
 */
diff --git a/modules/pocp.nf b/modules/pocp.nf
@@ -4,7 +4,7 @@ Calculate POCP value:
   puts "\n\nC1:\t#{c1}\nC2:\t#{c2}\n\nT1:\t#{t1}\nT2:\t#{t2}\n\nPOCP = [(C1+C2)/(T1+T2)]*100% = #{pocp}"
 */
 process pocp {
-    label 'blast'
+    label 'diamond'
     publishDir "${params.output}/pocp", mode: 'copy', pattern: "${genome_names}.txt"
 
     input:

diff --git a/nextflow.config b/nextflow.config
@@ -22,7 +22,7 @@ params {
     // Prokka
     gcode = 0
 
-    // Blast
+    // Diamond protein alignment
     evalue = '1e-5'
     seqidentity = 0.4
     alnlength = 0.5