Commit 6b1946f

Merge pull request #1366 from broadinstitute/staging
staging --> master

ekiernan authored Sep 10, 2024
2 parents d37a62a + e3fc193 commit 6b1946f
Showing 15 changed files with 285 additions and 53 deletions.
50 changes: 25 additions & 25 deletions pipeline_versions.txt
@@ -1,42 +1,42 @@
Pipeline Name Version Date of Last Commit
Optimus 7.6.0 2024-08-06
Multiome 5.5.0 2024-08-06
PairedTag 1.5.0 2024-08-06
atac 2.2.3 2024-08-02
SlideSeq 3.4.0 2024-08-06
snm3C 4.0.3 2024-08-05
MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02
scATAC 1.3.2 2023-08-03
MultiSampleSmartSeq2 2.2.21 2023-04-19
PairedTag 1.6.0 2024-08-02
Optimus 7.6.0 2024-08-06
atac 2.3.0 2024-08-29
snm3C 4.0.4 2024-08-06
SmartSeq2SingleSample 5.1.20 2023-04-19
Multiome 5.6.0 2024-08-02
scATAC 1.3.2 2023-08-03
BuildIndices 3.0.0 2023-12-06
MultiSampleSmartSeq2 2.2.21 2023-04-19
CEMBA 1.1.6 2023-12-18
SlideSeq 3.4.0 2024-08-06
BuildCembaReferences 1.0.0 2020-11-15
UltimaGenomicsWholeGenomeCramOnly 1.0.20 2024-08-02
CEMBA 1.1.6 2023-12-18
GDCWholeGenomeSomaticSingleSample 1.3.2 2024-08-02
ExomeGermlineSingleSample 3.1.22 2024-06-12
UltimaGenomicsWholeGenomeGermline 1.0.20 2024-08-02
WholeGenomeGermlineSingleSample 3.2.1 2024-06-12
VariantCalling 2.2.1 2024-06-12
UltimaGenomicsWholeGenomeCramOnly 1.0.20 2024-08-02
JointGenotypingByChromosomePartOne 1.4.12 2023-12-18
JointGenotypingByChromosomePartTwo 1.4.11 2023-12-18
UltimaGenomicsJointGenotyping 1.1.7 2023-12-18
JointGenotyping 1.6.10 2023-12-18
ReblockGVCF 2.2.1 2024-06-12
JointGenotypingByChromosomePartTwo 1.4.11 2023-12-18
JointGenotypingByChromosomePartOne 1.4.12 2023-12-18
ExternalExomeReprocessing 3.2.2 2024-08-02
ExternalWholeGenomeReprocessing 2.2.2 2024-08-02
ExomeReprocessing 3.2.2 2024-08-02
CramToUnmappedBams 1.1.3 2024-08-02
WholeGenomeReprocessing 3.2.2 2024-08-02
IlluminaGenotypingArray 1.12.21 2024-08-02
Arrays 2.6.27 2024-08-02
MultiSampleArrays 1.6.2 2024-08-02
VariantCalling 2.2.1 2024-06-12
WholeGenomeGermlineSingleSample 3.2.1 2024-06-12
UltimaGenomicsWholeGenomeGermline 1.0.20 2024-08-02
ExomeGermlineSingleSample 3.1.22 2024-06-12
ValidateChip 1.16.5 2024-08-02
Arrays 2.6.27 2024-08-02
Imputation 1.1.13 2024-05-21
RNAWithUMIsPipeline 1.0.16 2023-12-18
MultiSampleArrays 1.6.2 2024-08-02
BroadInternalUltimaGenomics 1.0.21 2024-08-02
BroadInternalArrays 1.1.11 2024-08-02
BroadInternalImputation 1.1.12 2024-08-02
BroadInternalRNAWithUMIs 1.0.33 2024-08-02
CramToUnmappedBams 1.1.3 2024-08-02
ExternalWholeGenomeReprocessing 2.2.2 2024-08-02
ExternalExomeReprocessing 3.2.2 2024-08-02
WholeGenomeReprocessing 3.2.2 2024-08-02
ExomeReprocessing 3.2.2 2024-08-02
IlluminaGenotypingArray 1.12.21 2024-08-02
CheckFingerprint 1.0.20 2024-08-02
AnnotationFiltration 1.2.5 2023-12-18
RNAWithUMIsPipeline 1.0.16 2023-12-18
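The version table above is plain whitespace-delimited text: one pipeline per row, with a name, a version, and a YYYY-MM-DD date. A quick way to load it (a sketch; it assumes the post-merge file, where each pipeline appears exactly once):

```python
# Parse pipeline_versions.txt into {name: (version, date)}.
# Assumes the merged file, with one header row and unique pipeline names.
versions = {}
with open("pipeline_versions.txt") as f:
    next(f)  # skip the "Pipeline Name  Version  Date of Last Commit" header
    for line in f:
        name, version, date = line.split()
        versions[name] = (version, date)

print(versions["atac"])  # ('2.3.0', '2024-08-29') after this merge
```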
7 changes: 7 additions & 0 deletions pipelines/skylab/atac/atac.changelog.md
@@ -1,3 +1,10 @@
# 2.3.0
2024-08-29 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

* Increased the default memory for the CreateFragmentFile task (mem_size 16 → 64)

# 2.2.3
2024-08-02 (Date of Last Commit)

39 changes: 30 additions & 9 deletions pipelines/skylab/atac/atac.wdl
@@ -46,7 +46,7 @@ workflow ATAC {
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
}

String pipeline_version = "2.2.3"
String pipeline_version = "2.3.0"

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
@@ -58,7 +58,7 @@ workflow ATAC {
String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
String samtools_docker = "samtools-dist-bwa:3.0.0"
String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
String snap_atac_docker = "snapatac2:1.0.9-2.6.3-1715865353"
String snap_atac_docker = "snapatac2:1.1.0"

# Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
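The hunk is collapsed past this point, but the pattern it sets up is simple: validate cloud_provider, then prepend the matching registry prefix to each image name (e.g. snapatac2:1.1.0 above). A minimal Python sketch of that selection — the Azure prefix is an assumption, since only the GCR prefix appears in this diff:

```python
# Sketch of the cloud-provider check and docker-prefix selection above.
# The Azure registry prefix is an assumed value; only the GCR one is shown in the diff.
def docker_prefix(cloud_provider: str) -> str:
    prefixes = {
        "gcp": "us.gcr.io/broad-gotc-prod/",
        "azure": "dsppipelinedev.azurecr.io/",  # assumed value
    }
    if cloud_provider not in prefixes:
        raise ValueError(f"cloud_provider must be 'gcp' or 'azure', got '{cloud_provider}'")
    return prefixes[cloud_provider]

print(docker_prefix("gcp") + "snapatac2:1.1.0")  # us.gcr.io/broad-gotc-prod/snapatac2:1.1.0
```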
@@ -158,11 +158,13 @@
File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

output {
File bam_aligned_output = bam_aligned_output_atac
File fragment_file = fragment_file_atac
File snap_metrics = snap_metrics_atac
File library_metrics_file = library_metrics
}
}
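These outputs use WDL's select_first to pick whichever branch actually ran — the preindexed BB_fragment task or the standard CreateFragmentFile task — since only one of the two optionals is defined. In Python terms (an illustrative sketch):

```python
# Python analogue of WDL's select_first: return the first defined value.
def select_first(values):
    for v in values:
        if v is not None:  # None stands in for an undefined WDL optional
            return v
    raise ValueError("select_first: all values are undefined")

# Only one branch ran, so exactly one candidate is set:
bb_metrics, create_fragment_metrics = None, "sample.atac_metrics.csv"
print(select_first([bb_metrics, create_fragment_metrics]))  # sample.atac_metrics.csv
```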

@@ -505,7 +507,7 @@ task CreateFragmentFile {
File annotations_gtf
Boolean preindex
Int disk_size = 500
Int mem_size = 16
Int mem_size = 64
Int nthreads = 4
String cpuPlatform = "Intel Cascade Lake"
String docker_path
@@ -547,17 +549,35 @@
import snapatac2.preprocessing as pp
import snapatac2 as snap
import anndata as ad
from collections import OrderedDict
import csv
# extract CB or BB (if preindex is true) tag from bam file to create fragment file
if preindex == "true":
pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB")
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
elif preindex == "false":
pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB")
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
# Add NHashID to metrics
nhash_ID_value = "XXX"
data = OrderedDict({'NHash_ID': atac_nhash_id, **data})
# Flatten the dictionary
flattened_data = []
for category, metrics in data.items():
if isinstance(metrics, dict):
for metric, value in metrics.items():
flattened_data.append((metric, value))
else:
flattened_data.append((category, metrics))
# Write to CSV
csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
with open(csv_file_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(flattened_data) # Write data
print(f"Dictionary successfully written to {csv_file_path}")
# calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
# those settings allow us to retain all barcodes
pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0)
atac_data = ad.read_h5ad("temp_metrics.h5ad")
# Add nhash_id to h5ad file as unstructured metadata
atac_data.uns['NHashID'] = atac_nhash_id
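The new metrics logic above has two steps worth spelling out: recipe_10x_metrics returns a nested dict of metric groups, the NHash ID is prepended, and the dict is flattened one level into (metric, value) rows before being written as CSV. A standalone illustration with an invented metrics dict (the real group and metric names come from SnapATAC2):

```python
# Standalone illustration of the flatten-and-write-CSV step above,
# using an invented dict shaped like recipe_10x_metrics output.
from collections import OrderedDict
import csv

data = {"Sequencing": {"Sequenced_read_pairs": 1000, "Valid_barcodes": 0.97},
        "Cells": {"Estimated_number_of_cells": 250}}
data = OrderedDict({"NHash_ID": "EXAMPLE_NHASH", **data})  # prepend the run ID

flattened = []
for category, metrics in data.items():
    if isinstance(metrics, dict):
        flattened.extend(metrics.items())      # one row per nested metric
    else:
        flattened.append((category, metrics))  # scalars (like NHash_ID) pass through

with open("example.atac_metrics.csv", "w", newline="") as f:
    csv.writer(f).writerows(flattened)
# CSV rows: NHash_ID,EXAMPLE_NHASH / Sequenced_read_pairs,1000 / ...
```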
@@ -580,5 +600,6 @@
output {
File fragment_file = "~{bam_base_name}.fragments.tsv"
File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
}
}
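The task now exposes that CSV as atac_library_metrics, named <bam_base_name>_<nhash_id>.atac_metrics.csv. Since the writer emits bare key,value rows with no header, reading it back is a one-liner (hypothetical file name):

```python
# Read the library-level metrics CSV back into a dict.
# Hypothetical file name; the writer above emits headerless key,value rows.
import csv

with open("sample_EXAMPLE_NHASH.atac_metrics.csv") as f:
    metrics = dict(csv.reader(f))
print(metrics["NHash_ID"])
```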
5 changes: 5 additions & 0 deletions pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,8 @@
# 5.6.0
2024-08-02 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

# 5.5.0
2024-08-06 (Date of Last Commit)

3 changes: 2 additions & 1 deletion pipelines/skylab/multiome/Multiome.wdl
@@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow Multiome {

String pipeline_version = "5.5.0"
String pipeline_version = "5.6.0"


input {
@@ -179,6 +179,7 @@ workflow Multiome {
File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
File atac_library_metrics = Atac.library_metrics_file

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
5 changes: 5 additions & 0 deletions pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,8 @@
# 1.6.0
2024-08-02 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

# 1.5.0
2024-08-06 (Date of Last Commit)

4 changes: 3 additions & 1 deletion pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow PairedTag {

String pipeline_version = "1.5.0"
String pipeline_version = "1.6.0"


input {
@@ -149,6 +149,7 @@

File atac_fragment_out = select_first([ParseBarcodes.atac_fragment_tsv,Atac_preindex.fragment_file])
File atac_h5ad_out = select_first([ParseBarcodes.atac_h5ad_file, Atac_preindex.snap_metrics])

output {

String pairedtag_pipeline_version_out = pipeline_version
@@ -157,6 +158,7 @@
File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
File fragment_file_atac = atac_fragment_out
File snap_metrics_atac = atac_h5ad_out
File atac_library_final = Atac_preindex.library_metrics_file

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
2 changes: 1 addition & 1 deletion pipelines/skylab/paired_tag/README.md
@@ -1,6 +1,6 @@
## Announcing a new site for WARP documentation!

Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!
Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!

### Paired-Tag summary

9 changes: 7 additions & 2 deletions pipelines/skylab/snm3C/snm3C.changelog.md
@@ -1,7 +1,12 @@
# 4.0.4
2024-08-06 (Date of Last Commit)

* Updated the Demultiplexing task in the snm3C wdl to flag when a file/cell is empty

# 4.0.3
2024-08-05 (Date of Last Commit)
2024-08-06 (Date of Last Commit)

* Updated the demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present
* Updated the Demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present

# 4.0.2
2024-07-09 (Date of Last Commit)
34 changes: 20 additions & 14 deletions pipelines/skylab/snm3C/snm3C.wdl
@@ -44,7 +44,7 @@ workflow snm3C {
}

# version of the pipeline
String pipeline_version = "4.0.3"
String pipeline_version = "4.0.4"

call Demultiplexing {
input:
@@ -154,6 +154,8 @@ task Demultiplexing {
File random_primer_indexes
String plate_id
Int batch_number
Int min_threshold = 100
Int max_threshold = 10000000
String docker

Int disk_size = 1000
@@ -179,7 +181,7 @@
$WORKING_DIR/r2.fastq.gz \
> $WORKING_DIR/~{plate_id}.stats.txt

# remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
# Remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz

python3 <<CODE
@@ -199,27 +201,31 @@
trimmed_count = int(adapter_match[1])
adapter_counts[adapter_name] = trimmed_count
# Removing fastq files with trimmed reads greater than 30
threshold = 10000000
# Removing fastq files with trimmed reads greater than 10000000 or less than 100
for filename in os.listdir(working_dir):
if filename.endswith('.fq.gz'):
file_path = os.path.join(working_dir, filename)
adapter_name = re.search(r'A(\d+)-R', filename)
adapter_name = re.search(r'([A-Za-z]\d+)-R', filename).group(1)
if adapter_name:
adapter_name = 'A' + adapter_name.group(1)
if adapter_name in adapter_counts and adapter_counts[adapter_name] > threshold:
os.remove(file_path)
if adapter_name in adapter_counts:
if adapter_counts[adapter_name] < ~{min_threshold} or adapter_counts[adapter_name] > ~{max_threshold}:
print("Removing ", file_path, " with count equal to ", adapter_counts[adapter_name])
os.remove(file_path)
CODE
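One subtlety in the rewritten filter above: the new regex r'([A-Za-z]\d+)-R' is applied with .group(1) directly, so a .fq.gz file whose name doesn't match would raise AttributeError before the if adapter_name: guard runs. A defensive sketch of the same min/max threshold filter (file names and counts invented):

```python
# Defensive variant of the adapter-count filter above (illustrative data).
import re

MIN_THRESHOLD, MAX_THRESHOLD = 100, 10_000_000  # the new WDL input defaults

def adapter_of(filename):
    match = re.search(r"([A-Za-z]\d+)-R", filename)
    return match.group(1) if match else None  # avoid .group(1) on a non-match

adapter_counts = {"A1": 50, "P7": 2_000_000}
for fname in ["A1-R1.fq.gz", "P7-R1.fq.gz", "odd-name.fq.gz"]:
    name = adapter_of(fname)
    if name in adapter_counts and not (MIN_THRESHOLD <= adapter_counts[name] <= MAX_THRESHOLD):
        print("removing", fname, "count:", adapter_counts[name])  # only A1 (50 < 100)
```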
# Check if the number of *R1.fq.gz files is 0
if [[ $(ls | grep "\-R1.fq.gz" | wc -l) -eq 0 ]]; then
echo "Error: No files found. All fastq files were removed. Exiting."
exit 1
fi
# Batch the fastq files into folders of batch_number size
R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz"))
R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz"))
batch_number=~{batch_number}
total_files=${#R1_files[@]}
echo "Total files: $total_files"
batch_number=~{batch_number}
if [[ $total_files -lt $batch_number ]]; then
echo "Warning: Number of files is less than the batch number. Updating batch number to $total_files."
batch_number=$total_files
@@ -229,14 +235,14 @@
mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs
done
# Counter for the folder index
# Counter for the folder index and create emptycells file
folder_index=1
WORKING_DIR=`pwd`
# Distribute the FASTQ files and create TAR files
for file in "${R1_files[@]}"; do
sample_id=$(basename "$file" "-R1.fq.gz")
r2_file="${sample_id}-R2.fq.gz"
mv $WORKING_DIR/$file batch$((folder_index))/$file
mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file
# Increment the counter
@@ -249,7 +255,7 @@
tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz
done
>>>
runtime {
docker: docker
disks: "local-disk ${disk_size} SSD"
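The surrounding bash (partly collapsed in this diff) walks the R1 files, moves each R1/R2 pair into a batch<N> folder, and tars each folder. The folder-index increment itself sits on a collapsed line; assuming round-robin assignment capped at batch_number, the distribution works like this:

```python
# Round-robin batching of R1/R2 pairs, mirroring the bash loop above.
# The increment line is collapsed in the diff, so round-robin capped at
# batch_number is an assumption.
def assign_batches(r1_files, batch_number):
    batch_number = min(batch_number, len(r1_files)) or 1  # shrink to file count, per the warning above
    batches = {i: [] for i in range(1, batch_number + 1)}
    folder_index = 1
    for r1 in r1_files:
        sample_id = r1[: -len("-R1.fq.gz")]
        batches[folder_index].append((r1, f"{sample_id}-R2.fq.gz"))
        folder_index = folder_index % batch_number + 1  # wrap back to batch 1
    return batches

print(assign_batches(["a-R1.fq.gz", "b-R1.fq.gz", "c-R1.fq.gz"], 2))
# {1: [('a-R1.fq.gz', 'a-R2.fq.gz'), ('c-R1.fq.gz', 'c-R2.fq.gz')],
#  2: [('b-R1.fq.gz', 'b-R2.fq.gz')]}
```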