diff --git a/README.md b/README.md index e0b5315..5b645cc 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ # Calculation of the Percentage Of Conserved Proteins -![](https://img.shields.io/badge/nextflow-20.10.0-brightgreen) +![](https://img.shields.io/badge/nextflow->=20.01.0-brightgreen) ![](https://img.shields.io/badge/uses-ruby-red) -![](https://img.shields.io/badge/can_use-conda-yellow.svg) +![](https://img.shields.io/badge/can_use-conda/mamba-yellow.svg) ![](https://img.shields.io/badge/can_use-docker-blue.svg) ![](https://img.shields.io/badge/can_use-singularity-orange.svg) ![](https://img.shields.io/badge/licence-GLP3-lightgrey.svg) [![Twitter Follow](https://img.shields.io/twitter/follow/martinhoelzer.svg?style=social)](https://twitter.com/martinhoelzer) -__Update 2023: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__ +__Update 2023/05: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__ + +__Update 2023/10: Now using [Diamond](https://www.nature.com/articles/s41592-021-01101-x) instead of Blast for protein alignments. Thx [@michoug](https://github.com/michoug) for the Pull Request.__ As input use one amino acid sequence FASTA file per genome such as provided by [Prokka](https://github.com/tseemann/prokka) or genome FASTA files which will be then annotated via [Prokka](https://github.com/tseemann/prokka). @@ -26,21 +28,21 @@ nextflow pull hoelzer/pocp # check availble release versions and development branches nextflow info hoelzer/pocp -# get the help page and define a release version -nextflow run hoelzer/pocp -r 2.0.0 --help +# get the help page and define a release version. ATTENTION: use latest version. +nextflow run hoelzer/pocp -r 2.1.0 --help # example with genome files as input, performing a local execution and using Docker -nextflow run hoelzer/pocp -r 2.0.0 --genomes 'example/*.fasta' -profile local,docker +nextflow run hoelzer/pocp -r 2.1.0 --genomes 'example/*.fasta' -profile local,docker # example with protein FASTA files as input (e.g. from Prokka pre-calculated), performing a SLURM execution and using conda -nextflow run hoelzer/pocp -r 2.0.0 --proteins 'example/*.faa' -profile slurm,conda +nextflow run hoelzer/pocp -r 2.1.0 --proteins 'example/*.faa' -profile slurm,conda ``` -The final output (`pocp-matrix.tsv`) should look like this (here, the resulting `pocp-matrix.tsv` was imported into LibreOffice and formated): +The final output (`pocp-matrix.tsv`) should look like this (here, the resulting TSV was imported into Numbers on MacOS): ![Example output](example_output.png) -If needed, the following parameters used for filtering the `blast` results can be +If needed, the following parameters used for filtering the `diamond` results (blastp mode) can be adjusted: ```bash @@ -48,4 +50,3 @@ adjusted: --seqidentity 0.4 --alnlength 0.5 ``` - diff --git a/configs/conda.config b/configs/conda.config index 65c4a74..6ff463a 100644 --- a/configs/conda.config +++ b/configs/conda.config @@ -1,6 +1,6 @@ process { withLabel: prokka { conda = "$baseDir/envs/prokka.yaml" } - withLabel: blast { conda = "$baseDir/envs/blast.yaml" } - withLabel: pocp { conda = "$baseDir/envs/blast.yaml" } + withLabel: diamond { conda = "$baseDir/envs/diamond.yaml" } + withLabel: pocp { conda = "$baseDir/envs/diamond.yaml" } withLabel: ruby { conda = "$baseDir/envs/ruby.yaml" } } \ No newline at end of file diff --git a/configs/container.config b/configs/container.config index cce93e7..5a098d9 100644 --- a/configs/container.config +++ b/configs/container.config @@ -1,6 +1,6 @@ process { withLabel: prokka { container = 'nanozoo/prokka:1.14.6--773a90d' } - withLabel: blast { container = 'nanozoo/blast:2.9.0--ded80ad' } - withLabel: pocp { container = 'nanozoo/blast:2.9.0--ded80ad' } + withLabel: diamond { container = 'nanozoo/diamond:2.1.8--b6f1f11' } + withLabel: pocp { container = 'nanozoo/diamond:2.1.8--b6f1f11' } withLabel: ruby { container = 'nanozoo/ruby:3.2.2--fdadcb3' } } \ No newline at end of file diff --git a/configs/local.config b/configs/local.config index 8931b80..e6b05c8 100644 --- a/configs/local.config +++ b/configs/local.config @@ -1,6 +1,6 @@ process { - withLabel: prokka { cpus = params.cores; memory = params.memory } - withLabel: blast { cpus = params.cores; memory = params.memory } - withLabel: pocp { cpus = 1; memory = '1 GB' } - withLabel: ruby { cpus = 1; memory = '1 GB' } + withLabel: prokka { cpus = params.cores; memory = params.memory } + withLabel: diamond { cpus = params.cores; memory = params.memory } + withLabel: pocp { cpus = 1; memory = '1 GB' } + withLabel: ruby { cpus = 1; memory = '1 GB' } } \ No newline at end of file diff --git a/configs/nodes.config b/configs/nodes.config index fee73bd..bbc7a55 100644 --- a/configs/nodes.config +++ b/configs/nodes.config @@ -1,6 +1,6 @@ process { - withLabel: prokka { cpus = 8 ; memory = '2 GB' } - withLabel: blast { cpus = 8 ; memory = '2 GB' } - withLabel: pocp { cpus = 1 ; memory = '1 GB' } - withLabel: ruby { cpus = 1 ; memory = '1 GB' } + withLabel: prokka { cpus = 8 ; memory = '2 GB' } + withLabel: diamond { cpus = 8 ; memory = '2 GB' } + withLabel: pocp { cpus = 1 ; memory = '1 GB' } + withLabel: ruby { cpus = 1 ; memory = '1 GB' } } \ No newline at end of file diff --git a/envs/blast.yaml b/envs/blast.yaml deleted file mode 100644 index 8066284..0000000 --- a/envs/blast.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: blast -channels: - - bioconda -dependencies: - - diamond=2.1 - - python=3.8 - \ No newline at end of file diff --git a/envs/diamond.yaml b/envs/diamond.yaml new file mode 100644 index 0000000..7089a06 --- /dev/null +++ b/envs/diamond.yaml @@ -0,0 +1,7 @@ +name: diamond +channels: + - bioconda +dependencies: + - diamond=2.1.8 + - python=3.11.4 + \ No newline at end of file diff --git a/example_output.png b/example_output.png index 8dabddb..44bb3b8 100644 Binary files a/example_output.png and b/example_output.png differ diff --git a/main.nf b/main.nf index 18ddf25..ee9c6aa 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ if (params.proteins && params.list) { proteins_input_ch = Channel // load modules include {prokka} from './modules/prokka' -include {blast} from './modules/blast' +include {diamond} from './modules/diamond' include {pocp; pocp_matrix} from './modules/pocp' // main workflow @@ -60,10 +60,10 @@ workflow { return tuple( id1, faa1, id2, faa2 ) } - blast_hits_ch = blast(allvsall_ch).hits.groupTuple() + diamond_hits_ch = diamond(allvsall_ch).hits.groupTuple() pocp_matrix( - pocp(blast_hits_ch).map {comparison, pocp_file, pocp_value -> [pocp_file]}.collect() + pocp(diamond_hits_ch).map {comparison, pocp_file, pocp_value -> [pocp_file]}.collect() ) } @@ -94,9 +94,9 @@ def helpMSG() { ${c_yellow}Options:${c_reset} --gcode genetic code for Prokka annotation [default: $params.gcode] - --evalue Evalue for blastp [default: $params.evalue] - --seqidentity Sequence identity for blastp [default: $params.seqidentity] - --alnlength Alignment length for blastp [default: $params.alnlength] + --evalue Evalue for diamond protein search [default: $params.evalue] + --seqidentity Sequence identity for diamond alignments [default: $params.seqidentity] + --alnlength Alignment length for diamond hits [default: $params.alnlength] --cores max cores per process for local use [default: $params.cores] --max_cores max cores (in total) for local use [default: $params.max_cores] --memory max memory for local use [default: $params.memory] diff --git a/modules/blast.nf b/modules/diamond.nf similarity index 53% rename from modules/blast.nf rename to modules/diamond.nf index da26d98..5b6d968 100644 --- a/modules/blast.nf +++ b/modules/diamond.nf @@ -1,31 +1,31 @@ /* -Run blastp and format output for downstream POCP calculations. +Run diamond and format output for downstream POCP calculations. +https://www.nature.com/articles/s41592-021-01101-x */ -process blast { - label 'blast' - publishDir "${params.output}/blast", mode: 'copy', pattern: "${name}-query-${name2}-db.blast*" +process diamond { + label 'diamond' + publishDir "${params.output}/diamond", mode: 'copy', pattern: "${name}-query-${name2}-db.diamond*" input: tuple val(name), path(fasta), val(name2), path(fasta2) output: - tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.blast"), emit: blast - tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.blast.hits"), emit: hits + tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.diamond"), emit: diamond + tuple env(genome_ids_sorted), path(fasta), file("${name}-query-${name2}-db.diamond.hits"), emit: hits script: """ diamond makedb --in ${fasta2} -d ${fasta2}.dmnd - diamond blastp --ultra-sensitive -p ${task.cpus} -q ${fasta} -d ${fasta2}.dmnd -e ${params.evalue} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend qlen sstart send evalue bitscore slen | awk '{if(\$3>${params.seqidentity*100} && \$4>(\$9*${params.alnlength})){print \$0}}' > ${name}-query-${name2}-db.blast - awk '{print \$1}' ${name}-query-${name2}-db.blast | sort | uniq | wc -l | awk '{print \$1}' > ${name}-query-${name2}-db.blast.hits - echo "\t${name}:\tFound \$(cat ${name}-query-${name2}-db.blast.hits) matches with an E value of less than ${params.evalue}, a sequence identity of more than ${params.seqidentity*100}%, and an alignable region of the query protein sequence of more than ${params.alnlength*100}%." + diamond blastp --ultra-sensitive -p ${task.cpus} -q ${fasta} -d ${fasta2}.dmnd -e ${params.evalue} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend qlen sstart send evalue bitscore slen | awk '{if(\$3>${params.seqidentity*100} && \$4>(\$9*${params.alnlength})){print \$0}}' > ${name}-query-${name2}-db.diamond + awk '{print \$1}' ${name}-query-${name2}-db.diamond | sort | uniq | wc -l | awk '{print \$1}' > ${name}-query-${name2}-db.diamond.hits + echo "\t${name}:\tFound \$(cat ${name}-query-${name2}-db.diamond.hits) matches with an E value of less than ${params.evalue}, a sequence identity of more than ${params.seqidentity*100}%, and an alignable region of the query protein sequence of more than ${params.alnlength*100}%." genome_ids_sorted='${name} ${name2}' genome_ids_sorted=\$(echo \$genome_ids_sorted | xargs -n1 | sort | xargs | sed 's/ /-vs-/g') - - #ruby pocp.rb ${fasta} ${fasta2} ${params.evalue} ${params.seqidentity} ${params.alnlength} ./ ${task.cpus} """ } /* Comments: +When blast was used: I removed the -parse_seqids parameter from the makeblastdb command because of an error with fasta IDs that are longer than 50 chars. strange. */ \ No newline at end of file diff --git a/modules/pocp.nf b/modules/pocp.nf index e040190..0a197bb 100644 --- a/modules/pocp.nf +++ b/modules/pocp.nf @@ -4,7 +4,7 @@ Calculate POCP value: puts "\n\nC1:\t#{c1}\nC2:\t#{c2}\n\nT1:\t#{t1}\nT2:\t#{t2}\n\nPOCP = [(C1+C2)/(T1+T2)]*100% = #{pocp}" */ process pocp { - label 'blast' + label 'diamond' publishDir "${params.output}/pocp", mode: 'copy', pattern: "${genome_names}.txt" input: diff --git a/nextflow.config b/nextflow.config index 627cf95..d7e7fec 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,7 +22,7 @@ params { // Prokka gcode = 0 - // Blast + // Diamond protein alignment evalue = '1e-5' seqidentity = 0.4 alnlength = 0.5