diff --git a/CHANGELOG.md b/CHANGELOG.md index cb3b78e7a..18f790c09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Special thanks to the following for their contributions to the release: - [PR #1471](https://github.com/nf-core/rnaseq/pull/1471) - Fix prepare_genome subworkflow for sortmerna - [PR #1473](https://github.com/nf-core/rnaseq/pull/1473) - Bump STAR modules - [PR #1474](https://github.com/nf-core/rnaseq/pull/1474) - Bump versions to 3.18.0 +- [PR #1475](https://github.com/nf-core/rnaseq/pull/1475) - Fix log publishing around umitools/umicollapse ## Parameters diff --git a/docs/output.md b/docs/output.md index 0bea48601..34326ea23 100644 --- a/docs/output.md +++ b/docs/output.md @@ -120,7 +120,7 @@ If multiple libraries/runs have been provided for the same sample in the input s -[UMI-tools](https://github.com/CGATOxford/UMI-tools) deduplicates reads based on unique molecular identifiers (UMIs) to address PCR-bias. Firstly, the UMI-tools `extract` command removes the UMI barcode information from the read sequence and adds it to the read name. Secondly, reads are deduplicated based on UMI identifier after mapping as highlighted in the [UMI-tools dedup](#umi-tools-dedup) section. +[UMI-tools](https://github.com/CGATOxford/UMI-tools) and [UMICollapse](https://github.com/Daniel-Liu-c0deb0t/UMICollapse) deduplicate reads based on unique molecular identifiers (UMIs) to address PCR-bias. Firstly, the UMI-tools `extract` command removes the UMI barcode information from the read sequence and adds it to the read name. Secondly, reads are deduplicated based on UMI identifier after mapping as highlighted in the [UMI dedup](#umi-dedup) section. To facilitate processing of input data which has the UMI barcode already embedded in the read name from the start, `--skip_umi_extract` can be specified in conjunction with `--with_umi`. 
@@ -305,7 +305,7 @@ The original BAM files generated by the selected alignment algorithm are further ![MultiQC - SAMtools mapped reads per contig plot](images/mqc_samtools_idxstats.png) -### UMI-tools dedup +### UMI dedup
Output files @@ -314,7 +314,7 @@ The original BAM files generated by the selected alignment algorithm are further - `.umi_dedup.sorted.bam`: If `--save_umi_intermeds` is specified the UMI deduplicated, coordinate sorted BAM file containing read alignments will be placed in this directory. - `.umi_dedup.sorted.bam.bai`: If `--save_umi_intermeds` is specified the BAI index file for the UMI deduplicated, coordinate sorted BAM file will be placed in this directory. - `.umi_dedup.sorted.bam.csi`: If `--save_umi_intermeds --bam_csi_index` is specified the CSI index file for the UMI deduplicated, coordinate sorted BAM file will be placed in this directory. -- `/umitools/` +- `/umitools/` (UMI-tools only) - `*_edit_distance.tsv`: Reports the (binned) average edit distance between the UMIs at each position. - `*_per_umi.tsv`: UMI-level summary statistics. - `*_per_umi_per_position.tsv`: Tabulates the counts for unique combinations of UMI and position. @@ -323,7 +323,7 @@ The content of the files above is explained in more detail in the [UMI-tools doc
-After extracting the UMI information from the read sequence (see [UMI-tools extract](#umi-tools-extract)), the second step in the removal of UMI barcodes involves deduplicating the reads based on both mapping and UMI barcode information using the UMI-tools `dedup` command. This will generate a filtered BAM file after the removal of PCR duplicates. +After extracting the UMI information from the read sequence (see [UMI-tools extract](#umi-tools-extract)), the second step in the removal of UMI barcodes involves deduplicating the reads based on both mapping and UMI barcode information. UMI deduplication can be carried out either with [UMI-tools](https://github.com/CGATOxford/UMI-tools) or [UMICollapse](https://github.com/Daniel-Liu-c0deb0t/UMICollapse), selected via the `--umi_dedup_tool` parameter. The output BAM files are the same, though UMI-tools additionally produces the statistics files listed above. Either method will generate a filtered BAM file after the removal of PCR duplicates. ### picard MarkDuplicates diff --git a/tests/.nftignore b/tests/.nftignore index 7f3fad699..8043747d3 100644 --- a/tests/.nftignore +++ b/tests/.nftignore @@ -32,7 +32,7 @@ umitools/*.umi_extract.log {hisat2,star_rsem,star_salmon}/stringtie/*.ballgown/t_data.ctab {hisat2,star_rsem,star_salmon}/stringtie/*.gene.abundance.txt {hisat2,star_rsem,star_salmon}/stringtie/*.{coverage,transcripts}.gtf -{hisat2,star_rsem,star_salmon}/umitools/genomic_dedup_log/*_UMICollapse.log +{hisat2,star_rsem,star_salmon}/{umitools,umicollapse}/{genomic,transcriptomic}_dedup_log/*.log {multiqc,multiqc/**}/multiqc_report.html {multiqc,multiqc/**}/multiqc_report_data/fastqc_{raw,trimmed}_top_overrepresented_sequences_table.txt {multiqc,multiqc/**}/multiqc_report_data/hisat2_pe_plot.txt diff --git a/tests/umi.nf.test b/tests/umi.nf.test index dba4c07fe..3a7083a28 100644 --- a/tests/umi.nf.test +++ b/tests/umi.nf.test @@ -15,6 +15,7 @@ nextflow_pipeline { umi_dedup_tool = 'umicollapse' aligner = 'hisat2' outdir = 
"$outputDir" + save_umi_intermeds = true } } @@ -49,6 +50,7 @@ nextflow_pipeline { umitools_dedup_stats = true skip_bbsplit = true outdir = "$outputDir" + save_umi_intermeds = true } } diff --git a/tests/umi.nf.test.snap b/tests/umi.nf.test.snap index 4e153a33b..1d0df4de3 100644 --- a/tests/umi.nf.test.snap +++ b/tests/umi.nf.test.snap @@ -612,6 +612,10 @@ "star_salmon/RAP1_IAA_30M_REP1", "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.sorted.bam", "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.sorted.bam.bai", + "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.transcriptome.bam", + "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.transcriptome.filtered.bam", + "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.transcriptome.sorted.bam", + "star_salmon/RAP1_IAA_30M_REP1.umi_dedup.transcriptome.sorted.bam.bai", "star_salmon/RAP1_IAA_30M_REP1/aux_info", "star_salmon/RAP1_IAA_30M_REP1/aux_info/ambig_info.tsv", "star_salmon/RAP1_IAA_30M_REP1/aux_info/expected_bias.gz", @@ -629,6 +633,9 @@ "star_salmon/RAP1_UNINDUCED_REP1", "star_salmon/RAP1_UNINDUCED_REP1.umi_dedup.sorted.bam", "star_salmon/RAP1_UNINDUCED_REP1.umi_dedup.sorted.bam.bai", + "star_salmon/RAP1_UNINDUCED_REP1.umi_dedup.transcriptome.bam", + "star_salmon/RAP1_UNINDUCED_REP1.umi_dedup.transcriptome.sorted.bam", + "star_salmon/RAP1_UNINDUCED_REP1.umi_dedup.transcriptome.sorted.bam.bai", "star_salmon/RAP1_UNINDUCED_REP1/aux_info", "star_salmon/RAP1_UNINDUCED_REP1/aux_info/ambig_info.tsv", "star_salmon/RAP1_UNINDUCED_REP1/aux_info/expected_bias.gz", @@ -646,6 +653,9 @@ "star_salmon/RAP1_UNINDUCED_REP2", "star_salmon/RAP1_UNINDUCED_REP2.umi_dedup.sorted.bam", "star_salmon/RAP1_UNINDUCED_REP2.umi_dedup.sorted.bam.bai", + "star_salmon/RAP1_UNINDUCED_REP2.umi_dedup.transcriptome.bam", + "star_salmon/RAP1_UNINDUCED_REP2.umi_dedup.transcriptome.sorted.bam", + "star_salmon/RAP1_UNINDUCED_REP2.umi_dedup.transcriptome.sorted.bam.bai", "star_salmon/RAP1_UNINDUCED_REP2/aux_info", "star_salmon/RAP1_UNINDUCED_REP2/aux_info/ambig_info.tsv", 
"star_salmon/RAP1_UNINDUCED_REP2/aux_info/expected_bias.gz", @@ -663,6 +673,10 @@ "star_salmon/WT_REP1", "star_salmon/WT_REP1.umi_dedup.sorted.bam", "star_salmon/WT_REP1.umi_dedup.sorted.bam.bai", + "star_salmon/WT_REP1.umi_dedup.transcriptome.bam", + "star_salmon/WT_REP1.umi_dedup.transcriptome.filtered.bam", + "star_salmon/WT_REP1.umi_dedup.transcriptome.sorted.bam", + "star_salmon/WT_REP1.umi_dedup.transcriptome.sorted.bam.bai", "star_salmon/WT_REP1/aux_info", "star_salmon/WT_REP1/aux_info/ambig_info.tsv", "star_salmon/WT_REP1/aux_info/expected_bias.gz", @@ -680,6 +694,10 @@ "star_salmon/WT_REP2", "star_salmon/WT_REP2.umi_dedup.sorted.bam", "star_salmon/WT_REP2.umi_dedup.sorted.bam.bai", + "star_salmon/WT_REP2.umi_dedup.transcriptome.bam", + "star_salmon/WT_REP2.umi_dedup.transcriptome.filtered.bam", + "star_salmon/WT_REP2.umi_dedup.transcriptome.sorted.bam", + "star_salmon/WT_REP2.umi_dedup.transcriptome.sorted.bam.bai", "star_salmon/WT_REP2/aux_info", "star_salmon/WT_REP2/aux_info/ambig_info.tsv", "star_salmon/WT_REP2/aux_info/expected_bias.gz", @@ -1261,10 +1279,18 @@ "trimgalore/WT_REP2_trimmed_2.fastq.gz_trimming_report.txt", "umitools", "umitools/RAP1_IAA_30M_REP1.umi_extract.log", + "umitools/RAP1_IAA_30M_REP1.umi_extract_1.fastq.gz", + "umitools/RAP1_IAA_30M_REP1.umi_extract_2.fastq.gz", + "umitools/RAP1_UNINDUCED_REP1.umi_extract.fastq.gz", "umitools/RAP1_UNINDUCED_REP1.umi_extract.log", + "umitools/RAP1_UNINDUCED_REP2.umi_extract.fastq.gz", "umitools/RAP1_UNINDUCED_REP2.umi_extract.log", "umitools/WT_REP1.umi_extract.log", - "umitools/WT_REP2.umi_extract.log" + "umitools/WT_REP1.umi_extract_1.fastq.gz", + "umitools/WT_REP1.umi_extract_2.fastq.gz", + "umitools/WT_REP2.umi_extract.log", + "umitools/WT_REP2.umi_extract_1.fastq.gz", + "umitools/WT_REP2.umi_extract_2.fastq.gz" ], [ "genome_gfp.fasta:md5,e23e302af63736a199985a169fdac055", @@ -1467,14 +1493,22 @@ "WT_REP2.umi_dedup.sorted_per_umi_per_position.tsv:md5,6f5656947a7f0076df446e6f40430027", 
"WT_REP2.umi_dedup.transcriptome.sorted_edit_distance.tsv:md5,3e3c6a7e8996e566350742e9911366d3", "WT_REP2.umi_dedup.transcriptome.sorted_per_umi.tsv:md5,0c986c4cb7a77f650a19e2c454b9b179", - "WT_REP2.umi_dedup.transcriptome.sorted_per_umi_per_position.tsv:md5,af9028dbdab81de3854a32cd1d19ac8b" + "WT_REP2.umi_dedup.transcriptome.sorted_per_umi_per_position.tsv:md5,af9028dbdab81de3854a32cd1d19ac8b", + "RAP1_IAA_30M_REP1.umi_extract_1.fastq.gz:md5,e83d7f738fbbfaa541a2e71fe4663447", + "RAP1_IAA_30M_REP1.umi_extract_2.fastq.gz:md5,4f2873cbf584d6e84187238a4ae2b8fa", + "RAP1_UNINDUCED_REP1.umi_extract.fastq.gz:md5,9e42242fd68baac592140f63a8a716ce", + "RAP1_UNINDUCED_REP2.umi_extract.fastq.gz:md5,5a92b642927b8603c4765e5305e23e9c", + "WT_REP1.umi_extract_1.fastq.gz:md5,f312fac9c384a889ae4f959839263604", + "WT_REP1.umi_extract_2.fastq.gz:md5,ffca24924108fd54151620b7538b9e1a", + "WT_REP2.umi_extract_1.fastq.gz:md5,c3180451a24ce51fc35c1684521ae287", + "WT_REP2.umi_extract_2.fastq.gz:md5,067ff23f8d1307ad241cd70bc186b5c1" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.2" + "nf-test": "0.9.2", + "nextflow": "24.10.3" }, - "timestamp": "2024-12-11T18:07:55.751564456" + "timestamp": "2024-12-20T00:02:04.611696704" }, "Params: --aligner hisat2 --umi_dedup_tool 'umicollapse'": { "content": [ @@ -2130,13 +2164,13 @@ "hisat2/stringtie/WT_REP2.coverage.gtf", "hisat2/stringtie/WT_REP2.gene.abundance.txt", "hisat2/stringtie/WT_REP2.transcripts.gtf", - "hisat2/umitools", - "hisat2/umitools/genomic_dedup_log", - "hisat2/umitools/genomic_dedup_log/RAP1_IAA_30M_REP1.umi_dedup.sorted_UMICollapse.log", - "hisat2/umitools/genomic_dedup_log/RAP1_UNINDUCED_REP1.umi_dedup.sorted_UMICollapse.log", - "hisat2/umitools/genomic_dedup_log/RAP1_UNINDUCED_REP2.umi_dedup.sorted_UMICollapse.log", - "hisat2/umitools/genomic_dedup_log/WT_REP1.umi_dedup.sorted_UMICollapse.log", - "hisat2/umitools/genomic_dedup_log/WT_REP2.umi_dedup.sorted_UMICollapse.log", + "hisat2/umicollapse", + 
"hisat2/umicollapse/genomic_dedup_log", + "hisat2/umicollapse/genomic_dedup_log/RAP1_IAA_30M_REP1.umi_dedup.sorted_UMICollapse.log", + "hisat2/umicollapse/genomic_dedup_log/RAP1_UNINDUCED_REP1.umi_dedup.sorted_UMICollapse.log", + "hisat2/umicollapse/genomic_dedup_log/RAP1_UNINDUCED_REP2.umi_dedup.sorted_UMICollapse.log", + "hisat2/umicollapse/genomic_dedup_log/WT_REP1.umi_dedup.sorted_UMICollapse.log", + "hisat2/umicollapse/genomic_dedup_log/WT_REP2.umi_dedup.sorted_UMICollapse.log", "multiqc", "multiqc/hisat2", "multiqc/hisat2/multiqc_report.html", @@ -2548,10 +2582,18 @@ "trimgalore/WT_REP2_trimmed_2.fastq.gz_trimming_report.txt", "umitools", "umitools/RAP1_IAA_30M_REP1.umi_extract.log", + "umitools/RAP1_IAA_30M_REP1.umi_extract_1.fastq.gz", + "umitools/RAP1_IAA_30M_REP1.umi_extract_2.fastq.gz", + "umitools/RAP1_UNINDUCED_REP1.umi_extract.fastq.gz", "umitools/RAP1_UNINDUCED_REP1.umi_extract.log", + "umitools/RAP1_UNINDUCED_REP2.umi_extract.fastq.gz", "umitools/RAP1_UNINDUCED_REP2.umi_extract.log", "umitools/WT_REP1.umi_extract.log", - "umitools/WT_REP2.umi_extract.log" + "umitools/WT_REP1.umi_extract_1.fastq.gz", + "umitools/WT_REP1.umi_extract_2.fastq.gz", + "umitools/WT_REP2.umi_extract.log", + "umitools/WT_REP2.umi_extract_1.fastq.gz", + "umitools/WT_REP2.umi_extract_2.fastq.gz" ], [ "genome_gfp.fasta:md5,e23e302af63736a199985a169fdac055", @@ -2688,14 +2730,22 @@ "cmd_info.json:md5,809380ddce725a8fab75dd7741b64bf6", "lib_format_counts.json:md5,d231ba7624b67eb654989f69530e2925", "R_sessionInfo.log:md5,fb0da0d7ad6994ed66a8e68348b19676", - "tx2gene.tsv:md5,0e2418a69d2eba45097ebffc2f700bfe" + "tx2gene.tsv:md5,0e2418a69d2eba45097ebffc2f700bfe", + "RAP1_IAA_30M_REP1.umi_extract_1.fastq.gz:md5,e83d7f738fbbfaa541a2e71fe4663447", + "RAP1_IAA_30M_REP1.umi_extract_2.fastq.gz:md5,4f2873cbf584d6e84187238a4ae2b8fa", + "RAP1_UNINDUCED_REP1.umi_extract.fastq.gz:md5,9e42242fd68baac592140f63a8a716ce", + 
"RAP1_UNINDUCED_REP2.umi_extract.fastq.gz:md5,5a92b642927b8603c4765e5305e23e9c", + "WT_REP1.umi_extract_1.fastq.gz:md5,f312fac9c384a889ae4f959839263604", + "WT_REP1.umi_extract_2.fastq.gz:md5,ffca24924108fd54151620b7538b9e1a", + "WT_REP2.umi_extract_1.fastq.gz:md5,c3180451a24ce51fc35c1684521ae287", + "WT_REP2.umi_extract_2.fastq.gz:md5,067ff23f8d1307ad241cd70bc186b5c1" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.2" + "nf-test": "0.9.2", + "nextflow": "24.10.3" }, - "timestamp": "2024-12-11T18:01:45.228731692" + "timestamp": "2024-12-19T22:33:42.012684597" }, "--umi_dedup_tool 'umitools - stub": { "content": [ @@ -2804,9 +2854,9 @@ ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.2" + "nf-test": "0.9.2", + "nextflow": "24.10.3" }, - "timestamp": "2024-12-11T18:08:48.404716766" + "timestamp": "2024-12-19T23:28:01.570835895" } -} +} \ No newline at end of file diff --git a/workflows/rnaseq/nextflow.config b/workflows/rnaseq/nextflow.config index e7a6290fa..a621b3bbe 100644 --- a/workflows/rnaseq/nextflow.config +++ b/workflows/rnaseq/nextflow.config @@ -134,24 +134,49 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ] } + // Use the same umi_dedup prefix for umitools and umicollapse + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_TRANSCRIPTOME:UMI(COLLAPSE|TOOLS_DEDUP)' { ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" } + } + + // Publishing logic for umitools: + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP' { publishDir = [ [ - path: { "${params.outdir}/${params.aligner}/umitools" }, + path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, mode: params.publish_dir_mode, - pattern: '*.tsv' + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? 
it : null } ], [ path: { "${params.outdir}/${params.aligner}/umitools/transcriptomic_dedup_log" }, mode: params.publish_dir_mode, pattern: '*.log' ], + [ + path: { "${params.outdir}/${params.aligner}/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + ] + } + + // Publishing logic for umicollapse + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_TRANSCRIPTOME:UMICOLLAPSE' { + publishDir = [ [ path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, mode: params.publish_dir_mode, pattern: '*.bam', saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ], + [ + path: { "${params.outdir}/${params.aligner}/umicollapse/transcriptomic_dedup_log" }, + mode: params.publish_dir_mode, + pattern: '*.log' ] ] } @@ -241,13 +266,21 @@ if (!params.skip_alignment) { ].join(' ').trim()} } + // Use the same umi_dedup prefix for umitools and umicollapse + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMI(COLLAPSE|TOOLS)_GENOME:UMI(COLLAPSE|TOOLS_DEDUP)' { ext.prefix = { "${meta.id}.umi_dedup.sorted" } + } + + // Publishing logic for umitools: + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME:UMITOOLS_DEDUP' { publishDir = [ [ - path: { "${params.outdir}/${params.aligner}/umitools" }, + path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, mode: params.publish_dir_mode, - pattern: '*.tsv' + pattern: '*.bam', + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ], [ path: { "${params.outdir}/${params.aligner}/umitools/genomic_dedup_log" }, @@ -255,10 +288,27 @@ if (!params.skip_alignment) { pattern: '*.log' ], [ - path: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? 
"${params.outdir}/${params.aligner}" : params.outdir }, + path: { "${params.outdir}/${params.aligner}/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + ] + } + + // Publishing logic for umicollapse + + withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMICOLLAPSE_GENOME:UMICOLLAPSE' { + publishDir = [ + [ + path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, mode: params.publish_dir_mode, pattern: '*.bam', - saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? it : null } + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } + ], + [ + path: { "${params.outdir}/${params.aligner}/umicollapse/genomic_dedup_log" }, + mode: params.publish_dir_mode, + pattern: '*.log' ] ] } @@ -267,10 +317,10 @@ if (!params.skip_alignment) { ext.args = { params.bam_csi_index ? '-c' : '' } ext.prefix = { "${meta.id}.umi_dedup.sorted" } publishDir = [ - path: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, + path: { params.save_align_intermeds || params.save_umi_intermeds ? "${params.outdir}/${params.aligner}" : params.outdir }, mode: params.publish_dir_mode, pattern: '*.{bai,csi}', - saveAs: { params.save_align_intermeds || params.with_umi || params.save_umi_intermeds ? it : null } + saveAs: { params.save_align_intermeds || params.save_umi_intermeds ? it : null } ] }