From f610833f12afac26739ee8a8b4f0361740b8ca95 Mon Sep 17 00:00:00 2001 From: martin Date: Tue, 12 Dec 2023 10:02:21 +0100 Subject: [PATCH 1/4] add one-vs-all functionality --- README.md | 24 +++++++++++++-------- main.nf | 55 +++++++++++++++++++++++++++++++++++++++++-------- nextflow.config | 3 +++ 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 5b645cc..8a0bd5d 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ __Update 2023/05: Re-implementation as a [Nextflow pipeline](nextflow.io). Pleas __Update 2023/10: Now using [Diamond](https://www.nature.com/articles/s41592-021-01101-x) instead of Blast for protein alignments. Thx [@michoug](https://github.com/michoug) for the Pull Request.__ +__Update 2023/12: One-vs-All comparisons are now possible in genome and protein input mode. Check `--help` message`.__ + As input use one amino acid sequence FASTA file per genome such as provided by [Prokka](https://github.com/tseemann/prokka) or genome FASTA files which will be then annotated via [Prokka](https://github.com/tseemann/prokka). -The pipeline will then calculate all pairwise alignments between all protein sequences and use this -information for POCP calculation following [Qin, Xie _et al_. -2014](https://www.ncbi.nlm.nih.gov/pubmed/24706738). +The pipeline will then calculate all-vs-all pairwise alignments between all protein sequences and use this +information for POCP calculation following [Qin, Xie _et al_. 2014](https://www.ncbi.nlm.nih.gov/pubmed/24706738). For one-vs-all comparisons see below. You only need `nextflow` and `conda` or `mamba` or `docker` or `singularity` to run the pipeline. I recommend using `docker`. Then install and run the pipeline: @@ -25,17 +26,20 @@ You only need `nextflow` and `conda` or `mamba` or `docker` or `singularity` to # get the pipeline code nextflow pull hoelzer/pocp -# check availble release versions and development branches +# check availble release versions and development branches, recommend to use latest release nextflow info hoelzer/pocp # get the help page and define a release version. ATTENTION: use latest version. -nextflow run hoelzer/pocp -r 2.1.0 --help +nextflow run hoelzer/pocp -r 2.2.0 --help + +# example with genome files as input, all-vs-all comparison, performing a local execution and using Docker +nextflow run hoelzer/pocp -r 2.2.0 --genomes 'example/*.fasta' -profile local,docker -# example with genome files as input, performing a local execution and using Docker -nextflow run hoelzer/pocp -r 2.1.0 --genomes 'example/*.fasta' -profile local,docker +# example with protein FASTA files as input (e.g. from Prokka pre-calculated), all-vs-all comparison, performing a SLURM execution and using conda +nextflow run hoelzer/pocp -r 2.2.0 --proteins 'example/*.faa' -profile slurm,conda -# example with protein FASTA files as input (e.g. from Prokka pre-calculated), performing a SLURM execution and using conda -nextflow run hoelzer/pocp -r 2.1.0 --proteins 'example/*.faa' -profile slurm,conda +# example with genome files as input, additional genome file to activate one-vs-all comparison, performing a local execution and using Docker +nextflow run hoelzer/pocp -r 2.2.0 --genomes 'example/*.fasta' --genome example/Cav_10DC88.fasta -profile local,docker ``` The final output (`pocp-matrix.tsv`) should look like this (here, the resulting TSV was imported into Numbers on MacOS): @@ -50,3 +54,5 @@ adjusted: --seqidentity 0.4 --alnlength 0.5 ``` + +Please note that per default an "all-vs-all" comparison is performed based on the provided FASTA files. However, you can also switch to an "one-vs-all" comparison by additionally providing a single genome FASTA via `--genome` next to the `--genomes` directory **or** a single protein multi-FASTA via `--protein` next to the `--proteins` directory. In both cases, only "one-vs-all" comparisons will be performed. diff --git a/main.nf b/main.nf index ee9c6aa..31550a8 100644 --- a/main.nf +++ b/main.nf @@ -16,8 +16,9 @@ println " " if (params.help) { exit 0, helpMSG() } if (params.genomes == '' && params.proteins == '') {exit 1, "input missing, use either [--genomes] or [--proteins]"} if (params.genomes != '' && params.proteins != '') {exit 1, "provide one input, use either [--genomes] or [--proteins]"} +if (params.genome && params.protein) {exit 1, "provide only one input, use either [--genome] or [--protein]"} -// genome fasta input & --list support +// genomes fasta input & --list support if (params.genomes && params.list) { genome_input_ch = Channel .fromPath( params.genomes, checkIfExists: true ) .splitCsv() @@ -41,9 +42,20 @@ if (params.proteins && params.list) { proteins_input_ch = Channel .map { file -> tuple(file.baseName, file) } } +// optional input for one-vs-all comparisons instead of all-vs-all +// genome fasta input +if (params.genome) { genome_single_input_ch = Channel + .fromPath( params.genome, checkIfExists: true) + .map { file -> tuple(file.baseName, file) } +} +// protein fasta input +if (params.protein) { protein_single_input_ch = Channel + .fromPath( params.protein, checkIfExists: true) + .map { file -> tuple(file.baseName, file) } +} // load modules -include {prokka} from './modules/prokka' +include {prokka as prokka; prokka as prokka_single} from './modules/prokka' include {diamond} from './modules/diamond' include {pocp; pocp_matrix} from './modules/pocp' @@ -55,12 +67,31 @@ workflow { proteins_ch = proteins_input_ch } - allvsall_ch = proteins_ch.combine(proteins_ch).branch { id1, faa1, id2, faa2 -> - controls: id1 != id2 - return tuple( id1, faa1, id2, faa2 ) + // switch to one-vs-all + if (params.genome) { protein_ch = prokka_single(genome_single_input_ch) } + if (params.protein) { protein_ch = protein_single_input_ch } + + if (params.genome || params.protein) { + // one-vs-all + one_vs_all_ch_part1 = proteins_ch.combine(protein_ch).branch { id1, faa1, id2, faa2 -> + controls: id1 != id2 + return tuple( id1, faa1, id2, faa2 ) + } + one_vs_all_ch_part2 = proteins_ch.combine(protein_ch).branch { id1, faa1, id2, faa2 -> + controls: id1 != id2 + return tuple( id2, faa2, id1, faa1 ) + } + comparisons_ch = one_vs_all_ch_part1.concat(one_vs_all_ch_part2) + } else { + // all-vs-all + all_vs_all_ch = proteins_ch.combine(proteins_ch).branch { id1, faa1, id2, faa2 -> + controls: id1 != id2 + return tuple( id1, faa1, id2, faa2 ) + } + comparisons_ch = all_vs_all_ch } - diamond_hits_ch = diamond(allvsall_ch).hits.groupTuple() + diamond_hits_ch = diamond(comparisons_ch).hits.groupTuple() pocp_matrix( pocp(diamond_hits_ch).map {comparison, pocp_file, pocp_value -> [pocp_file]}.collect() @@ -82,16 +113,22 @@ def helpMSG() { A prokaryotic genus can be defined as a group of species with all pairwise POCP values higher than 50%. ${c_yellow}Usage example:${c_reset} - nextflow run hoelzer/pocp -r 0.0.1 --genomes '*.fasta' + nextflow run hoelzer/pocp -r 2.2.0 --genomes '*.fasta' or - nextflow run hoelzer/pocp -r 0.0.1 --proteins '*.faa' + nextflow run hoelzer/pocp -r 2.2.0 --proteins '*.faa' - ${c_yellow}Input:${c_reset} + ${c_yellow}Input${c_reset} + ${c_yellow}All-vs-all comparisons (default):${c_reset} ${c_green} --genomes ${c_reset} '*.fasta' -> one genome per file or ${c_green} --proteins ${c_reset} '*.faa' -> one protein multi-FASTA per file ${c_dim} ..change above input to csv:${c_reset} ${c_green}--list ${c_reset} + ${c_yellow}Perform one-vs-all comparison against the additionally defined genome or protein FASTA (optional):${c_reset} + --genome genome.fasta -> one genome FASTA + or + --protein proteins.faa -> one protein multi-FASTA + ${c_yellow}Options:${c_reset} --gcode genetic code for Prokka annotation [default: $params.gcode] --evalue Evalue for diamond protein search [default: $params.evalue] diff --git a/nextflow.config b/nextflow.config index d7e7fec..f53bbab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,9 @@ params { genomes = '' proteins = '' list = false + // for one-vs-all switch + genome = false + protein = false // Prokka gcode = 0 From a22701a8ffb580f9cfc01c4dfa7864168b8a002b Mon Sep 17 00:00:00 2001 From: martin Date: Tue, 12 Dec 2023 10:36:38 +0100 Subject: [PATCH 2/4] finalize README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8a0bd5d..4970b11 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ [![Twitter Follow](https://img.shields.io/twitter/follow/martinhoelzer.svg?style=social)](https://twitter.com/martinhoelzer) -__Update 2023/05: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__ +__Update 2023/12: One-vs-All comparisons are now possible in genome and protein input mode. Check `--help` message.__ __Update 2023/10: Now using [Diamond](https://www.nature.com/articles/s41592-021-01101-x) instead of Blast for protein alignments. Thx [@michoug](https://github.com/michoug) for the Pull Request.__ -__Update 2023/12: One-vs-All comparisons are now possible in genome and protein input mode. Check `--help` message`.__ +__Update 2023/05: Re-implementation as a [Nextflow pipeline](nextflow.io). Please feel free to report any [issues](https://github.com/hoelzer/pocp/issues)!__ As input use one amino acid sequence FASTA file per genome such as provided by [Prokka](https://github.com/tseemann/prokka) or genome FASTA files which will be then annotated via [Prokka](https://github.com/tseemann/prokka). @@ -55,4 +55,4 @@ adjusted: --alnlength 0.5 ``` -Please note that per default an "all-vs-all" comparison is performed based on the provided FASTA files. However, you can also switch to an "one-vs-all" comparison by additionally providing a single genome FASTA via `--genome` next to the `--genomes` directory **or** a single protein multi-FASTA via `--protein` next to the `--proteins` directory. In both cases, only "one-vs-all" comparisons will be performed. +Please note that per default an "all-vs-all" comparison is performed based on the provided FASTA files. However, you can also switch to an "one-vs-all" comparison by additionally providing a single genome FASTA via `--genome` next to the `--genomes` input **or** a single protein multi-FASTA via `--protein` next to the `--proteins` input. In both cases, only "one-vs-all" comparisons will be performed. It is also possible to combine `--genomes` with a target `--protein` FASTA for "one-vs-all" and vice versa. From 5f050498e79e1d3567b84be59729f11e02345f80 Mon Sep 17 00:00:00 2001 From: martin Date: Tue, 12 Dec 2023 16:14:28 +0100 Subject: [PATCH 3/4] improve README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4970b11..a087249 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,13 @@ nextflow info hoelzer/pocp nextflow run hoelzer/pocp -r 2.2.0 --help # example with genome files as input, all-vs-all comparison, performing a local execution and using Docker -nextflow run hoelzer/pocp -r 2.2.0 --genomes 'example/*.fasta' -profile local,docker +nextflow run hoelzer/pocp -r 2.2.0 --genomes $HOME'/.nextflow/assets/hoelzer/pocp/example/*.fasta' -profile local,docker # example with protein FASTA files as input (e.g. from Prokka pre-calculated), all-vs-all comparison, performing a SLURM execution and using conda -nextflow run hoelzer/pocp -r 2.2.0 --proteins 'example/*.faa' -profile slurm,conda +nextflow run hoelzer/pocp -r 2.2.0 --proteins $HOME'/.nextflow/assets/hoelzer/pocp/example/*.faa' -profile slurm,conda # example with genome files as input, additional genome file to activate one-vs-all comparison, performing a local execution and using Docker -nextflow run hoelzer/pocp -r 2.2.0 --genomes 'example/*.fasta' --genome example/Cav_10DC88.fasta -profile local,docker +nextflow run hoelzer/pocp -r 2.2.0 --genomes $HOME'/.nextflow/assets/hoelzer/pocp/example/*.fasta' --genome $HOME/.nextflow/assets/hoelzer/pocp/example/Cav_10DC88.fasta -profile local,docker ``` The final output (`pocp-matrix.tsv`) should look like this (here, the resulting TSV was imported into Numbers on MacOS): From 93479e12cf58157f705bdf914ae7948d8105b845 Mon Sep 17 00:00:00 2001 From: martin Date: Thu, 14 Dec 2023 22:11:36 +0100 Subject: [PATCH 4/4] improve README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index a087249..dc77205 100644 --- a/README.md +++ b/README.md @@ -56,3 +56,9 @@ adjusted: ``` Please note that per default an "all-vs-all" comparison is performed based on the provided FASTA files. However, you can also switch to an "one-vs-all" comparison by additionally providing a single genome FASTA via `--genome` next to the `--genomes` input **or** a single protein multi-FASTA via `--protein` next to the `--proteins` input. In both cases, only "one-vs-all" comparisons will be performed. It is also possible to combine `--genomes` with a target `--protein` FASTA for "one-vs-all" and vice versa. + +If you use the POCP Nextflow pipeline, please cite the original POCP study that introduced the metric and the POCP-nf pipeline: + +**[Qin, Qi-Long, _et al_. "A proposed genus boundary for the prokaryotes based on genomic insights." Journal of bacteriology 196.12 (2014): 2210-2215.](https://pubmed.ncbi.nlm.nih.gov/24706738/)** + +**[Martin Hölzer. "POCP: An automatic Nextflow pipeline for calculating the percentage of conserved proteins in bacterial taxonomy". SOME JOURNAL. Hopefully in 2024.]()**