From be2ed7a7447b530d49bd18d6e1db23a95b71a01b Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:59:16 -0400 Subject: [PATCH 01/10] Adding new module asmultipcf adding new module asmultipcf to modify ascat to work on multisample contexts --- modules/nf-core/asmultipcf/environment.yml | 8 ++ modules/nf-core/asmultipcf/main.nf | 68 ++++++++++++ modules/nf-core/asmultipcf/meta.yml | 119 +++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 modules/nf-core/asmultipcf/environment.yml create mode 100644 modules/nf-core/asmultipcf/main.nf create mode 100644 modules/nf-core/asmultipcf/meta.yml diff --git a/modules/nf-core/asmultipcf/environment.yml b/modules/nf-core/asmultipcf/environment.yml new file mode 100644 index 0000000000..c436b22373 --- /dev/null +++ b/modules/nf-core/asmultipcf/environment.yml @@ -0,0 +1,8 @@ +name: asmultipcf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ascat=3.1.1 + - bioconda::cancerit-allelecount=4.3.0 diff --git a/modules/nf-core/asmultipcf/main.nf b/modules/nf-core/asmultipcf/main.nf new file mode 100644 index 0000000000..498eb8f3ae --- /dev/null +++ b/modules/nf-core/asmultipcf/main.nf @@ -0,0 +1,68 @@ +process ASMULTIPCF { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0': + 'biocontainers/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0' }" + + input: + tuple val(meta), path(tumor_logr_files) + tuple val(meta), path(tumor_baf_files) + tuple val(meta), path(normal_logr_file) + tuple val(meta), path(normal_baf_file) + + output: + tuple val(meta), path("*_asmultipcf_segments.txt"), emit: asmultipcf_segments + tuple val(meta), path("*_asmultipcf_purityploidy.txt"), emit: asmultipcf_purityploidy + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + library(ASCAT) + + # Read each tumor LogR file and column-bind them; strsplit() returns a list, so [[1]] extracts the vector of file paths to iterate over + tumor_logr_data <- do.call(cbind, lapply(strsplit("${tumor_logr_files}", " ")[[1]], function(file) { + read.table(file, header = TRUE, check.names = FALSE) + })) + write.table(tumor_logr_data, file = "combined_tumor_logr.txt", sep = "\t", quote = FALSE, row.names = FALSE) + + # Read each tumor BAF file and column-bind them (same one-path-per-read.table handling as for LogR) + tumor_baf_data <- do.call(cbind, lapply(strsplit("${tumor_baf_files}", " ")[[1]], function(file) { + read.table(file, header = TRUE, check.names = FALSE) + })) + write.table(tumor_baf_data, file = "combined_tumor_baf.txt", sep = "\t", quote = FALSE, row.names = FALSE) + + # Load the combined tumor tables together with the germline (normal) LogR/BAF files + ascat.bc <- ascat.loadData( + Tumor_LogR_file = "combined_tumor_logr.txt", + Tumor_BAF_file = "combined_tumor_baf.txt", + Germline_LogR_file = "$normal_logr_file", + Germline_BAF_file = "$normal_baf_file" + ) + + # Run multi-sample segmentation; penalty comes from params.ascat_asmultipcf_penalty and falls back to 5 when unset + ascat.bc <- ascat.asmultipcf(ascat.bc, penalty = ${params.ascat_asmultipcf_penalty ?: 5}) + + # Run ASCAT on the jointly segmented data + ascat.output <- ascat.runAscat(ascat.bc) + + # Write out segmented regions + write.table(ascat.output[["segments"]], file="${prefix}_asmultipcf_segments.txt", sep="\t", quote=FALSE, row.names=FALSE) + + # Write out purity 
and ploidy info + purity_ploidy <- data.frame( + Sample = names(ascat.output\$aberrantcellfraction), + Purity = unlist(ascat.output\$aberrantcellfraction), + Ploidy = unlist(ascat.output\$ploidy) + ) + write.table(purity_ploidy, file="${prefix}_asmultipcf_purityploidy.txt", sep="\t", quote=FALSE, row.names=FALSE) + + # Version export + writeLines(c("\\"${task.process}\\":", paste0(" ascat: ", packageVersion("ASCAT"))), "versions.yml") + """ +} \ No newline at end of file diff --git a/modules/nf-core/asmultipcf/meta.yml b/modules/nf-core/asmultipcf/meta.yml new file mode 100644 index 0000000000..db5d5b85c5 --- /dev/null +++ b/modules/nf-core/asmultipcf/meta.yml @@ -0,0 +1,119 @@ +name: asmultipcf +description: Performs multi-sample segmentation using ASCAT +keywords: + - bam + - copy number + - cram +tools: + - ascat: + description: ASCAT is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. ASCAT infers tumour purity (the fraction of tumour cells) and ploidy (the amount of DNA per tumour cell), expressed as multiples of haploid genomes from SNP array or massively parallel sequencing data, and calculates whole-genome allele-specific copy number profiles (the number of copies of both parental alleles for all SNP loci across the genome). + documentation: https://github.com/VanLoo-lab/ascat/tree/master/man + tool_dev_url: https://github.com/VanLoo-lab/ascat + doi: "10.1093/bioinformatics/btaa538" + licence: ["GPL v3"] +input: + - args: + type: map + description: | + Groovy Map containing tool parameters. MUST follow the structure/keywords below and be provided via modules.config. Parameters must be set between quotes. (optional) parameters can be removed from the map, if they are not set. For default values, please check the documentation above. 
+ + ``` + { + [ + "gender": "XX", + "genomeVersion": "hg19" + "purity": (optional), + "ploidy": (optional), + "gc_files": (optional), + "minCounts": (optional), + "BED_file": (optional) but recommended for WES, + "chrom_names": (optional), + "min_base_qual": (optional), + "min_map_qual": (optional), + "ref_fasta": (optional), + "skip_allele_counting_tumour": (optional), + "skip_allele_counting_normal": (optional) + ] + } + ``` + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_normal: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation For modifying chromosome notation in bam files please follow https://josephcckuo.wordpress.com/2016/11/17/modify-chromosome-notation-in-bam-file/. + pattern: "*.{bam,cram}" + - index_normal: + type: file + description: index for normal_bam/cram + pattern: "*.{bai,crai}" + - input_tumor: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation + pattern: "*.{bam,cram}" + - index_tumor: + type: file + description: index for tumor_bam/cram + pattern: "*.{bai,crai}" + - allele_files: + type: file + description: allele files for ASCAT WGS. Can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS + - loci_files: + type: file + description: loci files for ASCAT WGS. Loci files without chromosome notation can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS Make sure the chromosome notation matches the bam/cram input files. 
To add the chromosome notation to loci files (hg19/hg38) if necessary, you can run this command `if [[ $(samtools view | head -n1 | cut -f3)\" == *\"chr\"* ]]; then for i in {1..22} X; do sed -i 's/^/chr/' G1000_loci_hg19_chr_${i}.txt; done; fi` + - bed_file: + type: file + description: Bed file for ASCAT WES (optional, but recommended for WES) + - fasta: + type: file + description: Reference fasta file (optional) + - gc_file: + type: file + description: GC correction file (optional) - Used to do logR correction of the tumour sample(s) with genomic GC content + - rt_file: + type: file + description: replication timing correction file (optional, provide only in combination with gc_file) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - allelefreqs: + type: file + description: Files containing allele frequencies per chromosome + pattern: "*{alleleFrequencies_chr*.txt}" + - metrics: + type: file + description: File containing quality metrics + pattern: "*.{metrics.txt}" + - png: + type: file + description: ASCAT plots + pattern: "*.{png}" + - purityploidy: + type: file + description: File with purity and ploidy data + pattern: "*.{purityploidy.txt}" + - segments: + type: file + description: File with multi-sample segments data + pattern: "*.{asmultipcf_segments.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" +maintainers: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" From 7d6eb34ed84ba3e501d0bcb32e6a70af6b9a315c Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:01:34 -0400 Subject: [PATCH 02/10] Update main.nf Adding change that includes both a new parameter for asmultipcf and passes it to the existing ascat 
subworkflow grouped by patient. --- .../local/bam_variant_calling_somatic_all/main.nf | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/bam_variant_calling_somatic_all/main.nf b/subworkflows/local/bam_variant_calling_somatic_all/main.nf index cdfabfc3ac..8878f2bd6f 100644 --- a/subworkflows/local/bam_variant_calling_somatic_all/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_all/main.nf @@ -7,6 +7,7 @@ include { BAM_VARIANT_CALLING_FREEBAYES } from '../bam_variant_c include { BAM_VARIANT_CALLING_MPILEUP as MPILEUP_NORMAL } from '../bam_variant_calling_mpileup/main' include { BAM_VARIANT_CALLING_MPILEUP as MPILEUP_TUMOR } from '../bam_variant_calling_mpileup/main' include { BAM_VARIANT_CALLING_SOMATIC_ASCAT } from '../bam_variant_calling_somatic_ascat/main' + include { BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC } from '../bam_variant_calling_somatic_controlfreec/main' include { BAM_VARIANT_CALLING_SOMATIC_MANTA } from '../bam_variant_calling_somatic_manta/main' include { BAM_VARIANT_CALLING_SOMATIC_MUTECT2 } from '../bam_variant_calling_somatic_mutect2/main' @@ -41,6 +42,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { gc_file // channel: [optional] ascat gc content file rt_file // channel: [optional] ascat rt file joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + asmultipcf // boolean: [mandatory] [default: false] run ascat in multi-sample mode wes // boolean: [mandatory] [default: false] whether targeted data is processed main: @@ -53,16 +55,23 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL { out_msisensorpro = Channel.empty() vcf_mutect2 = Channel.empty() vcf_tiddit = Channel.empty() - + // ASCAT if (tools.split(',').contains('ascat')) { BAM_VARIANT_CALLING_SOMATIC_ASCAT( - cram, + // Remap channel to match module/subworkflow + // Adjust meta.map to handle both regular and asmultipcf modes + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> + 
params.asmultipcf ? + [ meta + [ id:meta.patient ], normal_cram, normal_crai, tumor_cram, tumor_crai ] : + [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai ] + }, allele_files, loci_files, (wes ? intervals_bed_combined : []), // No intervals needed if not WES fasta.map{ meta, fasta -> [ fasta ] }, gc_file, - rt_file + rt_file, + params.asmultipcf // Pass asmultipcf parameter to the process ) versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_ASCAT.out.versions) From 3f7bad256a2d166666f80a3a2a60f8f9518e4a87 Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:04:23 -0400 Subject: [PATCH 03/10] Update main.nf Drafting changes to the ASCAT module to layer asmultipcf downstream once ascat is done running. --- .../bam_variant_calling_somatic_ascat/main.nf | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf index 22802cfb58..d0cc4985a9 100644 --- a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf @@ -5,6 +5,7 @@ // A when clause condition is defined in the conf/modules.config to determine if the module should be run include { ASCAT } from '../../../modules/nf-core/ascat/main' +include { ASMULTIPCF } from '../../../modules/nf-core/asmultipcf/main' workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { @@ -16,6 +17,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { fasta // channel: [optional] fasta needed for cram gc_file // channel: [optional] txt for LogRCorrection rt_file // channel: [optional] txt for LogRCorrection + asmultipcf // boolean: [mandatory] whether to run ASMULTIPCF main: @@ -25,6 +27,36 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { ch_versions = ch_versions.mix(ASCAT.out.versions) + if (asmultipcf) { + // Group ASCAT outputs by patient + tumor_logr_by_patient = 
ASCAT.out.logrs.map { meta, file -> [meta.patient, meta, file] } + .groupTuple(by: 0) + .map { patient, metas, files -> [metas[0] + [id:patient], files] } + + tumor_baf_by_patient = ASCAT.out.bafs.map { meta, file -> [meta.patient, meta, file] } + .groupTuple(by: 0) + .map { patient, metas, files -> [metas[0] + [id:patient], files] } + + // Assuming normal samples are the same for all tumors of a patient + normal_logr_by_patient = ASCAT.out.logrs.map { meta, file -> [meta.patient, meta, file] } + .groupTuple(by: 0) + .map { patient, metas, files -> [metas[0] + [id:patient], files[0]] } + + normal_baf_by_patient = ASCAT.out.bafs.map { meta, file -> [meta.patient, meta, file] } + .groupTuple(by: 0) + .map { patient, metas, files -> [metas[0] + [id:patient], files[0]] } + + // Combine all inputs for ASMULTIPCF + asmultipcf_input = tumor_logr_by_patient.join(tumor_baf_by_patient) + .join(normal_logr_by_patient) + .join(normal_baf_by_patient) + + // Run ASMULTIPCF + ASMULTIPCF(asmultipcf_input) + + ch_versions = ch_versions.mix(ASMULTIPCF.out.versions) + } + emit: versions = ch_versions } From e0cea247d6241a173ba45529585c1688addc055a Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:12:33 -0400 Subject: [PATCH 04/10] Update main.nf Making changes to have it use nextflow to run all samples ascat first before proceeding with multi, drawing inspiration from the joint_mutect2 implementation --- .../bam_variant_calling_somatic_ascat/main.nf | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf index d0cc4985a9..f6570352b2 100644 --- a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf @@ -8,7 +8,6 @@ include { ASCAT } from '../../../modules/nf-core/ascat/main' include { 
ASMULTIPCF } from '../../../modules/nf-core/asmultipcf/main' workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { - take: cram_pair // channel: [mandatory] [meta, normal_cram, normal_crai, tumor_cram, tumor_crai] allele_files // channel: [mandatory] zip @@ -20,36 +19,33 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { asmultipcf // boolean: [mandatory] whether to run ASMULTIPCF main: - ch_versions = Channel.empty() + // Group input by patient + cram_pair_by_patient = cram_pair + .map { meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> + [meta.patient, meta, normal_cram, normal_crai, tumor_cram, tumor_crai] + } + .groupTuple() + + // Run ASCAT for all samples ASCAT(cram_pair, allele_files, loci_files, intervals_bed, fasta, gc_file, rt_file) - ch_versions = ch_versions.mix(ASCAT.out.versions) + // Group ASCAT outputs by patient + ascat_output_by_patient = ASCAT.out.logrs.join(ASCAT.out.bafs) + .map { meta, logr, baf -> [meta.patient, meta, logr, baf] } + .groupTuple() if (asmultipcf) { - // Group ASCAT outputs by patient - tumor_logr_by_patient = ASCAT.out.logrs.map { meta, file -> [meta.patient, meta, file] } - .groupTuple(by: 0) - .map { patient, metas, files -> [metas[0] + [id:patient], files] } - - tumor_baf_by_patient = ASCAT.out.bafs.map { meta, file -> [meta.patient, meta, file] } - .groupTuple(by: 0) - .map { patient, metas, files -> [metas[0] + [id:patient], files] } - - // Assuming normal samples are the same for all tumors of a patient - normal_logr_by_patient = ASCAT.out.logrs.map { meta, file -> [meta.patient, meta, file] } - .groupTuple(by: 0) - .map { patient, metas, files -> [metas[0] + [id:patient], files[0]] } - - normal_baf_by_patient = ASCAT.out.bafs.map { meta, file -> [meta.patient, meta, file] } - .groupTuple(by: 0) - .map { patient, metas, files -> [metas[0] + [id:patient], files[0]] } - - // Combine all inputs for ASMULTIPCF - asmultipcf_input = tumor_logr_by_patient.join(tumor_baf_by_patient) - .join(normal_logr_by_patient) - 
.join(normal_baf_by_patient) + // Prepare input for ASMULTIPCF + asmultipcf_input = ascat_output_by_patient + .map { patient, metas, logrs, bafs -> + def tumor_logrs = logrs.findAll { it.name.contains('tumor') } + def tumor_bafs = bafs.findAll { it.name.contains('tumor') } + def normal_logr = logrs.find { it.name.contains('normal') } + def normal_baf = bafs.find { it.name.contains('normal') } + [metas[0] + [id: patient], tumor_logrs, tumor_bafs, normal_logr, normal_baf] + } // Run ASMULTIPCF ASMULTIPCF(asmultipcf_input) @@ -57,6 +53,12 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { ch_versions = ch_versions.mix(ASMULTIPCF.out.versions) } + ch_versions = ch_versions.mix(ASCAT.out.versions) + emit: + ascat_segments = ASCAT.out.segments + ascat_purityploidy = ASCAT.out.purityploidy + asmultipcf_segments = asmultipcf ? ASMULTIPCF.out.asmultipcf_segments : Channel.empty() + asmultipcf_purityploidy = asmultipcf ? ASMULTIPCF.out.asmultipcf_purityploidy : Channel.empty() versions = ch_versions } From 0ba3dd46b89637131c845b1f12d5dd5527a5c7cf Mon Sep 17 00:00:00 2001 From: alc376 Date: Wed, 4 Sep 2024 14:40:21 -0400 Subject: [PATCH 05/10] Committing changes that corrected a missing variable on HTC, to be pulled back to PSC to use my own test set there --- .../tools_somatic_ascat_asmultipcf.config | 24 +++++++++++++++++++ nextflow.config | 1 + .../main.nf | 1 + tests/csv/3.0/ascat_somatic_asmultipcf.csv | 4 ++++ workflows/sarek/main.nf | 1 + 5 files changed, 31 insertions(+) create mode 100644 conf/test/tools_somatic_ascat_asmultipcf.config create mode 100644 tests/csv/3.0/ascat_somatic_asmultipcf.csv diff --git a/conf/test/tools_somatic_ascat_asmultipcf.config b/conf/test/tools_somatic_ascat_asmultipcf.config new file mode 100644 index 0000000000..ec6df8e483 --- /dev/null +++ b/conf/test/tools_somatic_ascat_asmultipcf.config @@ -0,0 +1,24 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running 
minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/ascat_somatic.csv" + genome = 'GATK.GRCh37' + germline_resource_tbi = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi" + ascat_loci = "G1000_loci_hg19.zip" + ascat_min_base_qual = 30 + chr_dir = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" + germline_resource = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" + intervals = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" + step = 'variant_calling' + tools = 'ascat' + wes = false +} diff --git a/nextflow.config b/nextflow.config index de95ae8c86..6f20d5232d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,7 @@ params { ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + asmultipcf = false // if true, enables multiple sample copy number calling with ASCAT only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired sample sentieon_dnascope_emit_mode = 'variant' // default value for Sentieon dnascope sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' diff --git a/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf index 
59b14ed898..30b5809a13 100644 --- a/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf +++ b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf @@ -35,6 +35,7 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_ALL { panel_of_normals // channel: [optional] panel_of_normals panel_of_normals_tbi // channel: [optional] panel_of_normals_tbi joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + asmultipcf // boolean: [mandatory] [default: false] run ascat in multi-sample mode wes // boolean: [mandatory] [default: false] whether targeted data is processed main: diff --git a/tests/csv/3.0/ascat_somatic_asmultipcf.csv b/tests/csv/3.0/ascat_somatic_asmultipcf.csv new file mode 100644 index 0000000000..012b1a26f0 --- /dev/null +++ b/tests/csv/3.0/ascat_somatic_asmultipcf.csv @@ -0,0 +1,4 @@ +patient,sex,status,sample,cram,crai +test3,XX,0,sample3,HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram,HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai +test3,XX,1,sample4,HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram,HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai +test3,XX,1,sample5,HG00147.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram,HG00147.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 90307f19c2..3cbed1b0d0 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -825,6 +825,7 @@ workflow SAREK { gc_file, rt_file, params.joint_mutect2, + params.asmultipcf, params.wes ) From 1fc5591733b6b7320619e69c8cfb0aa2a38da14d Mon Sep 17 00:00:00 2001 From: Alexander Chih-Chieh Chang Date: Wed, 4 Sep 2024 15:41:22 -0400 Subject: [PATCH 06/10] Added and double checked based on vignette for asmultipcf - having trouble testing --- conf/test/tools_somatic_ascat_asmultipcf.config | 4 ++-- modules/nf-core/asmultipcf/main.nf | 6 ++---- nextflow.config | 1 + 
.../local/bam_variant_calling_somatic_ascat/main.nf | 5 +++-- .../local/bam_variant_calling_tumor_only_all/main.nf | 1 - 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/conf/test/tools_somatic_ascat_asmultipcf.config b/conf/test/tools_somatic_ascat_asmultipcf.config index ec6df8e483..ea3402f528 100644 --- a/conf/test/tools_somatic_ascat_asmultipcf.config +++ b/conf/test/tools_somatic_ascat_asmultipcf.config @@ -10,10 +10,10 @@ */ params { - input = "${projectDir}/tests/csv/3.0/ascat_somatic.csv" + input = "${projectDir}/tests/csv/3.0/ascat_somatic_asmultipcf.csv" genome = 'GATK.GRCh37' germline_resource_tbi = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi" - ascat_loci = "G1000_loci_hg19.zip" + ascat_loci = "G1000_loci_hg38.zip" ascat_min_base_qual = 30 chr_dir = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" germline_resource = "${params.modules_testdata_base_path}/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" diff --git a/modules/nf-core/asmultipcf/main.nf b/modules/nf-core/asmultipcf/main.nf index 498eb8f3ae..b0e24d0e7b 100644 --- a/modules/nf-core/asmultipcf/main.nf +++ b/modules/nf-core/asmultipcf/main.nf @@ -8,10 +8,8 @@ process ASMULTIPCF { 'biocontainers/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0' }" input: - tuple val(meta), path(tumor_logr_files) - tuple val(meta), path(tumor_baf_files) - tuple val(meta), path(normal_logr_file) - tuple val(meta), path(normal_baf_file) + tuple val(meta), path(tumor_logr_files), path(tumor_baf_files), path(normal_logr_file), path(normal_baf_file) + output: tuple val(meta), path("*_asmultipcf_segments.txt"), emit: asmultipcf_segments diff --git a/nextflow.config b/nextflow.config index 6f20d5232d..2db4b044e8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -320,6 +320,7 @@ profiles { tools_somatic { 
includeConfig 'conf/test/tools_somatic.config' } tools_somatic_ascat { includeConfig 'conf/test/tools_somatic_ascat.config' } tools_tumoronly { includeConfig 'conf/test/tools_tumoronly.config' } + tools_somatic_asmultipcf { includeConfig 'conf/test/tools_somatic_ascat_asmultipcf.config' } trimming { includeConfig 'conf/test/trimming.config' } umi { includeConfig 'conf/test/umi.config' } use_gatk_spark { includeConfig 'conf/test/use_gatk_spark.config' } diff --git a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf index f6570352b2..e03b747e65 100644 --- a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf +++ b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf @@ -36,15 +36,16 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ASCAT { .map { meta, logr, baf -> [meta.patient, meta, logr, baf] } .groupTuple() - if (asmultipcf) { + if (params.asmultipcf) { // Prepare input for ASMULTIPCF asmultipcf_input = ascat_output_by_patient .map { patient, metas, logrs, bafs -> + def meta = metas[0] + [id: patient] def tumor_logrs = logrs.findAll { it.name.contains('tumor') } def tumor_bafs = bafs.findAll { it.name.contains('tumor') } def normal_logr = logrs.find { it.name.contains('normal') } def normal_baf = bafs.find { it.name.contains('normal') } - [metas[0] + [id: patient], tumor_logrs, tumor_bafs, normal_logr, normal_baf] + [meta, tumor_logrs, tumor_bafs, normal_logr, normal_baf] } // Run ASMULTIPCF diff --git a/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf index 30b5809a13..59b14ed898 100644 --- a/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf +++ b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf @@ -35,7 +35,6 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_ALL { panel_of_normals // channel: [optional] panel_of_normals panel_of_normals_tbi // channel: [optional] panel_of_normals_tbi 
joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode - asmultipcf // boolean: [mandatory] [default: false] run ascat in multi-sample mode wes // boolean: [mandatory] [default: false] whether targeted data is processed main: From 1d3f43deace277011977ab180814baf50c46c205 Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 15:55:26 -0400 Subject: [PATCH 07/10] Update CHANGELOG.md Updating change log for ASCAT asmultipcf --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b55823dc8..c5c193435f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - [#1638](https://github.com/nf-core/sarek/pull/1638) - Added additional documentation detailing ASCAT WES usage. +- [] Added asmultipcf functionality for multisample ASCAT calls. ## [3.4.3](https://github.com/nf-core/sarek/releases/tag/3.4.3) - Loametjåhkkå From cb5067f8bd5a04266216ef09b7c6e082df3aacca Mon Sep 17 00:00:00 2001 From: alexanderchang1 <35378211+alexanderchang1@users.noreply.github.com> Date: Wed, 4 Sep 2024 16:00:12 -0400 Subject: [PATCH 08/10] Update output.md Updating relevant usage docs for asmultipcf implementation --- docs/output.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/output.md b/docs/output.md index 7f8455f95d..1a6881ea21 100644 --- a/docs/output.md +++ b/docs/output.md @@ -692,6 +692,13 @@ The output is a tab delimited text file with the following columns: The file `.cnvs.txt` contains all segments predicted by ASCAT, both those with normal copy number (nMinor = 1 and nMajor =1) and those corresponding to copy number aberrations. +If `--asmultipcf` is turned on, the local module `asmultipcf` is run, which corrects segment calls across multiple samples from the same patient. 
This will give you two additional output files + +- `._asmultipcf_purityploidy.txt` + - file with information about purity and ploidy corrected for multiple samples +- `._asmultipcf_segments.txt` + - file with information about copy number segments corrected for multiple samples + #### CNVKit From 5e91fb35c027abf71310be6dc572622c7a85e1d2 Mon Sep 17 00:00:00 2001 From: Alexander Chih-Chieh Chang Date: Wed, 4 Sep 2024 16:14:47 -0400 Subject: [PATCH 09/10] Fixed changes after run nf-core schema and nf-core lint --- modules/local/asmultipcf/environment.yml | 8 ++ modules/local/asmultipcf/main.nf | 66 +++++++++++++ modules/local/asmultipcf/meta.yml | 119 +++++++++++++++++++++++ nextflow_schema.json | 12 ++- 4 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 modules/local/asmultipcf/environment.yml create mode 100644 modules/local/asmultipcf/main.nf create mode 100644 modules/local/asmultipcf/meta.yml diff --git a/modules/local/asmultipcf/environment.yml b/modules/local/asmultipcf/environment.yml new file mode 100644 index 0000000000..c436b22373 --- /dev/null +++ b/modules/local/asmultipcf/environment.yml @@ -0,0 +1,8 @@ +name: asmultipcf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ascat=3.1.1 + - bioconda::cancerit-allelecount=4.3.0 diff --git a/modules/local/asmultipcf/main.nf b/modules/local/asmultipcf/main.nf new file mode 100644 index 0000000000..b0e24d0e7b --- /dev/null +++ b/modules/local/asmultipcf/main.nf @@ -0,0 +1,66 @@ +process ASMULTIPCF { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0': + 'biocontainers/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0' }" + + input: + tuple val(meta), path(tumor_logr_files), path(tumor_baf_files), path(normal_logr_file), path(normal_baf_file) + + + output: + tuple val(meta), path("*_asmultipcf_segments.txt"), emit: asmultipcf_segments + tuple val(meta), path("*_asmultipcf_purityploidy.txt"), emit: asmultipcf_purityploidy + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + library(ASCAT) + + # Read each tumor LogR file and column-bind them; strsplit() returns a list, so [[1]] extracts the vector of file paths to iterate over + tumor_logr_data <- do.call(cbind, lapply(strsplit("${tumor_logr_files}", " ")[[1]], function(file) { + read.table(file, header = TRUE, check.names = FALSE) + })) + write.table(tumor_logr_data, file = "combined_tumor_logr.txt", sep = "\t", quote = FALSE, row.names = FALSE) + + # Read each tumor BAF file and column-bind them (same one-path-per-read.table handling as for LogR) + tumor_baf_data <- do.call(cbind, lapply(strsplit("${tumor_baf_files}", " ")[[1]], function(file) { + read.table(file, header = TRUE, check.names = FALSE) + })) + write.table(tumor_baf_data, file = "combined_tumor_baf.txt", sep = "\t", quote = FALSE, row.names = FALSE) + + # Load the combined tumor tables together with the germline (normal) LogR/BAF files + ascat.bc <- ascat.loadData( + Tumor_LogR_file = "combined_tumor_logr.txt", + Tumor_BAF_file = "combined_tumor_baf.txt", + Germline_LogR_file = "$normal_logr_file", + Germline_BAF_file = "$normal_baf_file" + ) + + # Run multi-sample segmentation; penalty comes from params.ascat_asmultipcf_penalty and falls back to 5 when unset + ascat.bc <- ascat.asmultipcf(ascat.bc, penalty = ${params.ascat_asmultipcf_penalty ?: 5}) + + # Run ASCAT on the jointly segmented data + ascat.output <- ascat.runAscat(ascat.bc) + + # Write out segmented regions + write.table(ascat.output[["segments"]], file="${prefix}_asmultipcf_segments.txt", sep="\t", quote=FALSE, row.names=FALSE) + + # Write out purity and ploidy info + purity_ploidy <- data.frame( 
Sample = names(ascat.output\$aberrantcellfraction), + Purity = unlist(ascat.output\$aberrantcellfraction), + Ploidy = unlist(ascat.output\$ploidy) + ) + write.table(purity_ploidy, file="${prefix}_asmultipcf_purityploidy.txt", sep="\t", quote=FALSE, row.names=FALSE) + + # Version export + writeLines(c("\\"${task.process}\\":", paste0(" ascat: ", packageVersion("ASCAT"))), "versions.yml") + """ +} \ No newline at end of file diff --git a/modules/local/asmultipcf/meta.yml b/modules/local/asmultipcf/meta.yml new file mode 100644 index 0000000000..db5d5b85c5 --- /dev/null +++ b/modules/local/asmultipcf/meta.yml @@ -0,0 +1,119 @@ +name: asmultipcf +description: Performs multi-sample segmentation using ASCAT +keywords: + - bam + - copy number + - cram +tools: + - ascat: + description: ASCAT is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. ASCAT infers tumour purity (the fraction of tumour cells) and ploidy (the amount of DNA per tumour cell), expressed as multiples of haploid genomes from SNP array or massively parallel sequencing data, and calculates whole-genome allele-specific copy number profiles (the number of copies of both parental alleles for all SNP loci across the genome). + documentation: https://github.com/VanLoo-lab/ascat/tree/master/man + tool_dev_url: https://github.com/VanLoo-lab/ascat + doi: "10.1093/bioinformatics/btaa538" + licence: ["GPL v3"] +input: + - args: + type: map + description: | + Groovy Map containing tool parameters. MUST follow the structure/keywords below and be provided via modules.config. Parameters must be set between quotes. (optional) parameters can be removed from the map, if they are not set. For default values, please check the documentation above. 
+ + ``` + { + [ + "gender": "XX", + "genomeVersion": "hg19", + "purity": (optional), + "ploidy": (optional), + "gc_files": (optional), + "minCounts": (optional), + "BED_file": (optional) but recommended for WES, + "chrom_names": (optional), + "min_base_qual": (optional), + "min_map_qual": (optional), + "ref_fasta": (optional), + "skip_allele_counting_tumour": (optional), + "skip_allele_counting_normal": (optional) + ] + } + ``` + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_normal: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation. For modifying chromosome notation in bam files please follow https://josephcckuo.wordpress.com/2016/11/17/modify-chromosome-notation-in-bam-file/. + pattern: "*.{bam,cram}" + - index_normal: + type: file + description: index for normal_bam/cram + pattern: "*.{bai,crai}" + - input_tumor: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation + pattern: "*.{bam,cram}" + - index_tumor: + type: file + description: index for tumor_bam/cram + pattern: "*.{bai,crai}" + - allele_files: + type: file + description: allele files for ASCAT WGS. Can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS + - loci_files: + type: file + description: loci files for ASCAT WGS. Loci files without chromosome notation can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS Make sure the chromosome notation matches the bam/cram input files. 
To add the chromosome notation to loci files (hg19/hg38) if necessary, you can run this command `if [[ $(samtools view <your_bam_file> | head -n1 | cut -f3)\" == *\"chr\"* ]]; then for i in {1..22} X; do sed -i 's/^/chr/' G1000_loci_hg19_chr_${i}.txt; done; fi` + - bed_file: + type: file + description: Bed file for ASCAT WES (optional, but recommended for WES) + - fasta: + type: file + description: Reference fasta file (optional) + - gc_file: + type: file + description: GC correction file (optional) - Used to do logR correction of the tumour sample(s) with genomic GC content + - rt_file: + type: file + description: replication timing correction file (optional, provide only in combination with gc_file) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - allelefreqs: + type: file + description: Files containing allele frequencies per chromosome + pattern: "*{alleleFrequencies_chr*.txt}" + - metrics: + type: file + description: File containing quality metrics + pattern: "*.{metrics.txt}" + - png: + type: file + description: ASCAT plots + pattern: "*.{png}" + - purityploidy: + type: file + description: File with purity and ploidy data + pattern: "*_asmultipcf_purityploidy.txt" + - segments: + type: file + description: File with multi-sample segments data + pattern: "*_asmultipcf_segments.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" +maintainers: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1611d58f40..6050125346 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -356,7 +356,7 @@ }, "cf_ploidy": { "type": "string", - "default": "2", + "default": 2, "fa_icon": "fas fa-bacon", "help_text": "In case of doubt, you can set different 
values and Control-FREEC will select the one that explains most observed CNAs Example: ploidy=2 , ploidy=2,3,4. For more details, see the [manual](http://boevalab.inf.ethz.ch/FREEC/tutorial.html).", "description": "Genome ploidy used by ControlFREEC", @@ -1061,7 +1061,8 @@ "fa_icon": "far fa-check-circle", "description": "Validation of parameters in lenient more.", "hidden": true, - "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode).", + "default": true }, "hook_url": { "type": "string", @@ -1111,5 +1112,10 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "asmultipcf": { + "type": "boolean" + } + } } From 29c45ac3199b8197aa3781f71835260cd05ae46d Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 10 Dec 2024 10:13:21 +0000 Subject: [PATCH 10/10] [automated] Fix code linting --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index beefd4f1de..a0d0ce3a32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [1638](https://github.com/nf-core/sarek/pull/1638) - Added additional documentation detailing ASCAT WES usage. - [1640](https://github.com/nf-core/sarek/pull/1620) - Add `lofreq` as a tumor-only variant caller - [1642](https://github.com/nf-core/sarek/pull/1642) - Back to dev -- [1646](https://github.com/nf-core/sarek/pull/1646) - Added asmultipcf functionality for multisample ASCAT calls. +- [1646](https://github.com/nf-core/sarek/pull/1646) - Added asmultipcf functionality for multisample ASCAT calls. 
- [1653](https://github.com/nf-core/sarek/pull/1653) - Updates `sarek_subway` files with `lofreq` - [1660](https://github.com/nf-core/sarek/pull/1642) - Add `--length_required` for minimal reads length with `FASTP` - [1663](https://github.com/nf-core/sarek/pull/1663) - Massive conda modules update