From 27f0e8348c6fde9bfff80d82a3b23257734a3f5f Mon Sep 17 00:00:00 2001
From: aawdeh
Date: Wed, 14 Aug 2024 14:47:31 -0400
Subject: [PATCH 01/18] PD-2721: Add checks for empty cells (#1353)

---
 pipeline_versions.txt                     |  2 +-
 pipelines/skylab/snm3C/snm3C.changelog.md |  9 ++++--
 pipelines/skylab/snm3C/snm3C.wdl          | 34 +++++++++++++----------
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/pipeline_versions.txt b/pipeline_versions.txt
index c80f1a60b..f8a8c4a1a 100644
--- a/pipeline_versions.txt
+++ b/pipeline_versions.txt
@@ -4,7 +4,7 @@ Multiome 5.5.0 2024-08-06
 PairedTag 1.5.0 2024-08-06
 atac 2.2.3 2024-08-02
 SlideSeq 3.4.0 2024-08-06
-snm3C 4.0.3 2024-08-05
+snm3C 4.0.4 2024-08-06
 MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02
 scATAC 1.3.2 2023-08-03
 SmartSeq2SingleSample 5.1.20 2023-04-19

diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md
index 7690c4ca2..91b7cab98 100644
--- a/pipelines/skylab/snm3C/snm3C.changelog.md
+++ b/pipelines/skylab/snm3C/snm3C.changelog.md
@@ -1,7 +1,12 @@
+# 4.0.4
+2024-08-06 (Date of Last Commit)
+
+* Updated the Demultiplexing task in the snm3C WDL to flag when a file/cell is empty
+
 # 4.0.3
-2024-08-05 (Date of Last Commit)
+2024-08-06 (Date of Last Commit)
 
-* Updated the demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present
+* Updated the Demultiplexing task in the snm3C WDL to dynamically update the batch number based on the number of fastq files present
 
 # 4.0.2
 2024-07-09 (Date of Last Commit)

diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl
index 953d9912f..b7387032f 100644
--- a/pipelines/skylab/snm3C/snm3C.wdl
+++ b/pipelines/skylab/snm3C/snm3C.wdl
@@ -44,7 +44,7 @@ workflow snm3C {
     }

     # version of the pipeline
-    String pipeline_version = "4.0.3"
+    String pipeline_version = "4.0.4"

     call Demultiplexing {
       input:
@@ -154,6 +154,8 @@ task Demultiplexing {
     File random_primer_indexes
     String plate_id
     Int batch_number
+    Int min_threshold = 100
+    Int max_threshold = 10000000
     String docker
     Int disk_size = 1000
@@ -179,7 +181,7 @@ task Demultiplexing {
       $WORKING_DIR/r2.fastq.gz \
      > $WORKING_DIR/~{plate_id}.stats.txt

-    # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
+    # Remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
    rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz

    python3 <<CODE
    ...
-        if adapter_counts[adapter_name] < threshold:
-            os.remove(file_path)
+        if adapter_name in adapter_counts:
+            if adapter_counts[adapter_name] < ~{min_threshold} or adapter_counts[adapter_name] > ~{max_threshold}:
+                print("Removing ", file_path, " with count equal to ", adapter_counts[adapter_name])
+                os.remove(file_path)
    CODE
+
+    # Check if the number of *R1.fq.gz files is 0
+    if [[ $(ls | grep "\-R1.fq.gz" | wc -l) -eq 0 ]]; then
+        echo "Error: No files found. All fastq files were removed. Exiting."
+        exit 1
+    fi

    # Batch the fastq files into folders of batch_number size
    R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz"))
    R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz"))
+    batch_number=~{batch_number}
    total_files=${#R1_files[@]}
    echo "Total files: $total_files"

-    batch_number=~{batch_number}
-
    if [[ $total_files -lt $batch_number ]]; then
      echo "Warning: Number of files is less than the batch number. Updating batch number to $total_files."
      batch_number=$total_files
@@ -229,14 +235,14 @@ task Demultiplexing {
      mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs
    done

-    # Counter for the folder index
+    # Counter for the folder index and create emptycells file
    folder_index=1
-    WORKING_DIR=`pwd`

    # Distribute the FASTQ files and create TAR files
    for file in "${R1_files[@]}"; do
      sample_id=$(basename "$file" "-R1.fq.gz")
      r2_file="${sample_id}-R2.fq.gz"
+
      mv $WORKING_DIR/$file batch$((folder_index))/$file
      mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file

      # Increment the counter
@@ -249,7 +255,7 @@ task Demultiplexing {
      tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz
    done
  >>>
-  
+
  runtime {
    docker: docker
    disks: "local-disk ${disk_size} SSD"

From c350acb1321e60b45c138cdb979cfd7ca53b47a8 Mon Sep 17 00:00:00 2001
From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com>
Date: Thu, 15 Aug 2024 12:59:07 -0400
Subject: [PATCH 02/18] Np add firecloud api scripts (#1358)

* pin all latest docker version

* add firecloud scripts

* add testing scripts
---
 scripts/firecloud_api/GetWorkflowOutputs.py | 53 +++++++++++
 .../SubmitWorkflowAndGetStatus.py           | 90 +++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 scripts/firecloud_api/GetWorkflowOutputs.py
 create mode 100644 scripts/firecloud_api/SubmitWorkflowAndGetStatus.py

diff --git a/scripts/firecloud_api/GetWorkflowOutputs.py b/scripts/firecloud_api/GetWorkflowOutputs.py
new file mode 100644
index 000000000..8970a888d
--- /dev/null
+++ b/scripts/firecloud_api/GetWorkflowOutputs.py
@@ -0,0 +1,53 @@
+import requests
+import argparse
+
+def get_workflow_outputs(token, namespace, workspace_name, submission_id, workflow_id, pipeline_name):
+    # API endpoint to get the workflow outputs
+    url = f"https://api.firecloud.org/api/workspaces/{namespace}/{workspace_name}/submissions/{submission_id}/workflows/{workflow_id}/outputs"
+    print(f"Requesting URL: {url}")
+
+    # Headers including the authorization token
+    headers = {
+        'accept': '*/*',
+        'Authorization': f'Bearer {token}',
+    }
+
+    # Make the GET request
+    response = requests.get(url, headers=headers)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        json_response = response.json()  # parse the JSON response
+        # extract the outputs section using the task name
+        outputs = json_response.get('tasks', {}).get(pipeline_name, {}).get('outputs', {})
+
+        # Turn the outputs dictionary into a list of values
+        output_values = list(outputs.values())
+
+        return outputs, output_values
+    else:
+        print(f"Failed to retrieve workflow outputs. Status code: {response.status_code}")
+        return None, None
+
+if __name__ == "__main__":
+    # Define the command-line arguments
+    parser = argparse.ArgumentParser(description='Fetch workflow outputs from the API.')
+    parser.add_argument('--token', required=True, help='Authentication token')
+    parser.add_argument('--namespace', required=True, help='Workspace namespace')
+    parser.add_argument('--workspace', required=True, help='Workspace name')
+    parser.add_argument('--submission_id', required=True, help='Submission ID')
+    parser.add_argument('--workflow_id', required=True, help='Workflow ID')
+    parser.add_argument('--pipeline_name', required=True, help='Name of the pipeline')
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Call the function with the parsed arguments
+    outputs, output_values = get_workflow_outputs(args.token, args.namespace, args.workspace, args.submission_id, args.workflow_id, args.pipeline_name)
+
+    if outputs:
+        print("Outputs:")
+        print(outputs)
+
+        print("\nOutput Values:")
+        print(output_values)
\ No newline at end of file

diff --git a/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py b/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py
new file mode 100644
index 000000000..3276bf415
--- /dev/null
+++ b/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py
@@ -0,0 +1,90 @@
+import requests
+import time
+import argparse
+import json
+
+def create_submission(token, workspace_namespace, workspace_name, submission_data):
+    # API endpoint
+    base_url = f'https://api.firecloud.org/api/workspaces/{workspace_namespace}/{workspace_name}/submissions'
+
+    # Headers to make API requests
+    headers = {
+        'accept': 'application/json',
+        'Authorization': f'Bearer {token}',
+        'Content-Type': 'application/json'
+    }
+
+    # Create the submission
+    # send an HTTP POST request to the API endpoint
+    response = requests.post(base_url, headers=headers, json=submission_data)
+    # convert the response json into a dictionary
+    submission_response = response.json()
+    # extract the submission ID from the response dictionary
+    submission_id = submission_response.get("submissionId")
+
+    if not submission_id:
+        print("Failed to create submission.")
+    else:
+        print(f"Submission created with ID: {submission_id}")
+        return submission_id
+
+def poll_submission_status(token, workspace_namespace, workspace_name, submission_id):
+
+    # Status endpoint
+    status_url = f'https://api.firecloud.org/api/workspaces/{workspace_namespace}/{workspace_name}/submissions/{submission_id}'
+
+    # Headers to make API requests
+    headers = {
+        'accept': 'application/json',
+        'Authorization': f'Bearer {token}'
+    }
+
+    # polling the submission status
+    # create an empty list to store the previous workflow status
+    previous_workflow_status = []
+
+    # loop until the submission is done
+    while True:
+        # send a get request and convert the response json into a dictionary
+        status_response = requests.get(status_url, headers=headers)
+        status_data = status_response.json()
+
+        # get the submission status
+        submission_status = status_data.get("status")
+        # get the workflow status of each workflow in the submission
+        workflows_status = [workflow.get("status") for workflow in status_data.get("workflows", [])]
+
+        # print the workflow status to stdout if it has changed
+        if workflows_status != previous_workflow_status:
+            print(f"Workflows Status: {workflows_status}")
+            previous_workflow_status = workflows_status
+
+        # Check if the submission has completed
+        if submission_status == "Done" and "Failed" in workflows_status:
+            print("At least one workflow has failed.")
+            break
+        elif submission_status == "Done":
+            break
+
+        # Wait for 10 seconds before polling again
+        time.sleep(10)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Submit and monitor a job.')
+    parser.add_argument('--token', required=True, help='API access token')
+    parser.add_argument('--workspace-namespace', required=True, help='Workspace namespace')
+    parser.add_argument('--workspace-name', required=True, help='Workspace name')
+    parser.add_argument('--submission-data-file', required=True, help='Path to the JSON file containing submission data')
+
+    args = parser.parse_args()
+
+    # load submission data from JSON file
+    with open(args.submission_data_file, 'r') as file:
+        submission_data = json.load(file)
+
+    # create submission and get submission ID
+    submission_id = create_submission(args.token, args.workspace_namespace, args.workspace_name, submission_data)
+
+    if submission_id:
+        # Poll submission status
+        poll_submission_status(args.token, args.workspace_namespace, args.workspace_name, submission_id)

From bb5d9c2a032a361734d56b51ee5b7edcc3f911b1 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Wed, 28 Aug 2024 15:30:25 -0400
Subject: [PATCH 03/18] adding 10x wrapper function

---
 pipelines/skylab/atac/atac.wdl | 36 ++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index 45f6a7175..9ed8ce532 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -58,7 +58,7 @@ workflow ATAC {
    String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
    String samtools_docker = "samtools-dist-bwa:3.0.0"
    String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
-    String snap_atac_docker = "snapatac2:1.0.9-2.6.3-1715865353"
+    String snap_atac_docker = "snapatac2:lk-PD-2738"

    # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
    if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
@@ -158,11 +158,13 @@ workflow ATAC {
  File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
  File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
  File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
+  File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

  output {
    File bam_aligned_output = bam_aligned_output_atac
    File fragment_file = fragment_file_atac
    File snap_metrics = snap_metrics_atac
+    File library_metrics_file = library_metrics
  }
}
@@ -547,13 +549,38 @@ task CreateFragmentFile {
    import snapatac2.preprocessing as pp
    import snapatac2 as snap
    import anndata as ad
+    from collections import OrderedDict
+    import csv

    # extract CB or BB (if preindex is true) tag from bam file to create fragment file
    if preindex == "true":
-        pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB")
+        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
    elif preindex == "false":
-        pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB")
-
+        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+
+    # Add NHashID to metrics
+    nhash_ID_value = "XXX"
+    data = OrderedDict({'NHash_ID': atac_nhash_id, **data})
+    # Flatten the dictionary
+    flattened_data = []
+    for category, metrics in data.items():
+        if isinstance(metrics, dict):
+            for metric, value in metrics.items():
+                flattened_data.append((metric, value))
+        else:
+            flattened_data.append((category, metrics))
+
+    # Write to CSV
+    csv_file_path = "~{bam_base_name}_~{atac_nhash}.atac_metrics.csv"
+    with open(csv_file_path, mode='w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerow(['Metric', 'Value'])  # Write header
+        writer.writerows(flattened_data)  # Write data
+
+    print(f"Dictionary successfully written to {csv_file_path}")
+
+
+
    # calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
    # those settings allow us to retain all barcodes
@@ -580,5 +607,6 @@ task CreateFragmentFile {
  output {
    File fragment_file = "~{bam_base_name}.fragments.tsv"
    File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
+    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash}.atac_metrics.csv"
  }
}

From 6340acad47835658f264af02fe62c36067eb618d Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Wed, 28 Aug 2024 15:34:04 -0400
Subject: [PATCH 04/18] fixing nhash_id variable

---
 pipelines/skylab/atac/atac.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index 9ed8ce532..67166fbcf 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -571,7 +571,7 @@ task CreateFragmentFile {
        flattened_data.append((category, metrics))

    # Write to CSV
-    csv_file_path = "~{bam_base_name}_~{atac_nhash}.atac_metrics.csv"
+    csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Metric', 'Value'])  # Write header
@@ -607,6 +607,6 @@ task CreateFragmentFile {
  output {
    File fragment_file = "~{bam_base_name}.fragments.tsv"
    File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
-    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash}.atac_metrics.csv"
+    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
  }
}

From ae011ccd51b4f83545c038a15fd6fc2d6fc9dcb0 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 08:27:37 -0400
Subject: [PATCH 05/18] added h5ad output

---
 pipelines/skylab/atac/atac.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index 67166fbcf..c39af5fb2 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -554,7 +554,7 @@ task CreateFragmentFile {

    # extract CB or BB (if preindex is true) tag from bam file to create fragment file
    if preindex == "true":
-        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
    elif preindex == "false":
        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)

From 9e91e40cb0d4e8e7a789edc4ba8814022b455e39 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 09:24:19 -0400
Subject: [PATCH 06/18] adding h5ad argument to second command

---
 pipelines/skylab/atac/atac.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index c39af5fb2..e1abc0aac 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -556,7 +556,7 @@ task CreateFragmentFile {
    if preindex == "true":
        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
    elif preindex == "false":
-        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+        data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)

From f553620e650aeac4d791de445ade178212b099de Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 09:24:53 -0400
Subject: [PATCH 07/18] removing temp metrics

---
 pipelines/skylab/atac/atac.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index e1abc0aac..a07f4c2a7 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -584,7 +584,7 @@ task CreateFragmentFile {

    # calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
    # those settings allow us to retain all barcodes
-    pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0)
+    atac_data = ad.read_h5ad("temp_metrics.h5ad")

    # Add nhash_id to h5ad file as unstructured metadata
    atac_data.uns['NHashID'] = atac_nhash_id

From 9f62353ab347735f489109502742e6ce90eef3e2 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 09:41:36 -0400
Subject: [PATCH 08/18] Update atac.wdl

---
 pipelines/skylab/atac/atac.wdl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index a07f4c2a7..b5d2b3f02 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -579,12 +579,6 @@ task CreateFragmentFile {

    print(f"Dictionary successfully written to {csv_file_path}")

-
-
-
-    # calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
-    # those settings allow us to retain all barcodes
-
    atac_data = ad.read_h5ad("temp_metrics.h5ad")

    # Add nhash_id to h5ad file as unstructured metadata
    atac_data.uns['NHashID'] = atac_nhash_id

From 2e597da2d3a2c41ab326d87e06ba05a7d012a61c Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 09:48:45 -0400
Subject: [PATCH 09/18] removing header in ATAC and starting documentation for metrics

---
 pipelines/skylab/atac/atac.wdl                 |  1 -
 website/docs/Pipelines/ATAC/library-metrics.md | 12 ++++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 website/docs/Pipelines/ATAC/library-metrics.md

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index b5d2b3f02..5fcb3ffa7 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -574,7 +574,6 @@ task CreateFragmentFile {
    csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
-        writer.writerow(['Metric', 'Value'])  # Write header
        writer.writerows(flattened_data)  # Write data

    print(f"Dictionary successfully written to {csv_file_path}")

diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md
new file mode 100644
index 000000000..a16db5b9d
--- /dev/null
+++ b/website/docs/Pipelines/ATAC/library-metrics.md
@@ -0,0 +1,12 @@
+---
+sidebar_position: 2
+---
+
+# ATAC Library Metrics Overview
+
+The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/SnapATAC2) to generate library-level metrics in CSV format.
+
+
+| Metric | Description |
+| --- | --- |
+

From 1941d99c4ce927e11c626e4b824ff32a90f04e8b Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 09:53:35 -0400
Subject: [PATCH 10/18] added metric definitions to the library overview

---
 .../docs/Pipelines/ATAC/library-metrics.md | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md
index a16db5b9d..184cfeb8e 100644
--- a/website/docs/Pipelines/ATAC/library-metrics.md
+++ b/website/docs/Pipelines/ATAC/library-metrics.md
@@ -8,5 +8,28 @@ The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/Snap
 
 
 | Metric | Description |
-| --- | --- | 
+| --- | --- |
+| NHash_ID | A unique identifier used to track and reference the specific sample or dataset. |
+| Sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. |
+| Sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. |
+| Fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. |
+| Fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. |
+| Mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. |
+| Median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. |
+| Fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. |
+| Fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. |
+| Fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. |
+| Fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. |
+| Fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. |
+| Fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. |
+| Fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. |
+| Fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. |
+| TSS_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. |
+| Number_of_peaks | The total number of peaks, or regions of accessible chromatin, identified in the dataset, representing potential regulatory elements. |
+| Fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. |
From 0523b89bfef6693bceeb9f0696fbf88052987e77 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 12:33:18 -0400
Subject: [PATCH 11/18] updating Multiome documentation and outputs with new ATAC metrics

---
 pipelines/skylab/multiome/Multiome.wdl             | 1 +
 website/docs/Pipelines/ATAC/README.md              | 1 +
 website/docs/Pipelines/Multiome_Pipeline/README.md | 1 +
 3 files changed, 3 insertions(+)

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
index 21584c01d..0ecb33aa2 100644
--- a/pipelines/skylab/multiome/Multiome.wdl
+++ b/pipelines/skylab/multiome/Multiome.wdl
@@ -179,6 +179,7 @@ workflow Multiome {
    File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
    File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
    File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
+    File atac_library_metrics = Atac.library_metrics

    # optimus outputs
    File genomic_reference_version_gex = Optimus.genomic_reference_version

diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md
index 1bb2dd639..fc3a985ab 100644
--- a/website/docs/Pipelines/ATAC/README.md
+++ b/website/docs/Pipelines/ATAC/README.md
@@ -93,6 +93,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie
 | bam_aligned_output | `<input_id>.bam` | BAM containing aligned reads from ATAC workflow. |
 | fragment_file | `<input_id>.fragments.tsv` | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". |
 | snap_metrics | `<input_id>.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. |
+| library_metrics | `<input_id>_<atac_nhash_id>.atac_metrics.csv` | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md). |
 
 ## Versioning and testing

diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md
index d77c5ec3b..afb277766 100644
--- a/website/docs/Pipelines/Multiome_Pipeline/README.md
+++ b/website/docs/Pipelines/Multiome_Pipeline/README.md
@@ -107,6 +107,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt
 | fragment_file_atac | `<input_id>_atac.fragments.sorted.tsv.gz` | Sorted and bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "ATAC Barcode", "Number of reads", and "GEX Barcode". |
 | fragment_file_index | `<input_id>_atac.fragments.sorted.tsv.gz.tbi` | tabix index file for the fragment file. |
 | snap_metrics_atac | `<input_id>_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. Also contains the equivalent gene expression barcode for each ATAC barcode in the `gex_barcodes` column of the `h5ad.obs` property. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. |
+| atac_library_metrics | `<input_id>_<atac_nhash_id>.atac_metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. |
 | genomic_reference_version_gex | `<reference_version>.txt` | File containing the Genome build, source and GTF annotation version. |
 | bam_gex | `<input_id>_gex.bam` | BAM file containing aligned reads from Optimus workflow. |
 | matrix_gex | `<input_id>_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. |
From 17b2f94507b9efcf6f1b8b50188888fc8aeaa67c Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 12:46:44 -0400
Subject: [PATCH 12/18] updated changelogs, pipeline versions, and final outputs

---
 pipelines/skylab/atac/atac.changelog.md            | 5 +++++
 pipelines/skylab/atac/atac.wdl                     | 2 +-
 pipelines/skylab/multiome/Multiome.changelog.md    | 5 +++++
 pipelines/skylab/multiome/Multiome.wdl             | 4 ++--
 pipelines/skylab/paired_tag/PairedTag.changelog.md | 5 +++++
 pipelines/skylab/paired_tag/PairedTag.wdl          | 4 +++-
 6 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md
index ffe875fa0..1207b32c1 100644
--- a/pipelines/skylab/atac/atac.changelog.md
+++ b/pipelines/skylab/atac/atac.changelog.md
@@ -1,3 +1,8 @@
+# 2.3.0
+2024-08-29 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 2.2.3
 2024-08-02 (Date of Last Commit)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index 5fcb3ffa7..bc4d0c11f 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -46,7 +46,7 @@ workflow ATAC {
    String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
  }

-  String pipeline_version = "2.2.3"
+  String pipeline_version = "2.3.0"

  # Determine docker prefix based on cloud provider
  String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"

diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
index afc52d57f..98904837e 100644
--- a/pipelines/skylab/multiome/Multiome.changelog.md
+++ b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,8 @@
+# 5.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 5.5.0
 2024-08-06 (Date of Last Commit)

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
index 0ecb33aa2..d647e8294 100644
--- a/pipelines/skylab/multiome/Multiome.wdl
+++ b/pipelines/skylab/multiome/Multiome.wdl
@@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

 workflow Multiome {

-    String pipeline_version = "5.5.0"
+    String pipeline_version = "5.6.0"

    input {
@@ -179,7 +179,7 @@ workflow Multiome {
    File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
    File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
    File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
-    File atac_library_metrics = Atac.library_metrics
+    File atac_library_metrics = Atac.library_metrics_file

    # optimus outputs
    File genomic_reference_version_gex = Optimus.genomic_reference_version

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
index e9da183ec..ba4a05376 100644
--- a/pipelines/skylab/paired_tag/PairedTag.changelog.md
+++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,8 @@
+# 1.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 1.5.0
 2024-08-06 (Date of Last Commit)

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
index e35a153de..4206f4fab 100644
--- a/pipelines/skylab/paired_tag/PairedTag.wdl
+++ b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

 workflow PairedTag {

-    String pipeline_version = "1.5.0"
+    String pipeline_version = "1.6.0"

    input {
@@ -149,6 +149,7 @@ workflow PairedTag {

    File atac_fragment_out = select_first([ParseBarcodes.atac_fragment_tsv,Atac_preindex.fragment_file])
    File atac_h5ad_out = select_first([ParseBarcodes.atac_h5ad_file, Atac_preindex.snap_metrics])
+
    output {
        String pairedtag_pipeline_version_out = pipeline_version
@@ -157,6 +158,7 @@ workflow PairedTag {
        File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
        File fragment_file_atac = atac_fragment_out
        File snap_metrics_atac = atac_h5ad_out
+        File atac_library_final = Atac_preindex.library_metrics_file

        # optimus outputs
        File genomic_reference_version_gex = Optimus.genomic_reference_version

From 45bf059f0effc06e1a66624810411eb4ccc930c4 Mon Sep 17 00:00:00 2001
From: GitHub Action
Date: Thu, 29 Aug 2024 16:47:21 +0000
Subject: [PATCH 13/18] Updated pipeline_versions.txt with all pipeline version information

---
 pipeline_versions.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline_versions.txt b/pipeline_versions.txt
index f8a8c4a1a..a01deead2 100644
--- a/pipeline_versions.txt
+++ b/pipeline_versions.txt
@@ -1,8 +1,8 @@
 Pipeline Name Version Date of Last Commit
 Optimus 7.6.0 2024-08-06
-Multiome 5.5.0 2024-08-06
-PairedTag 1.5.0 2024-08-06
-atac 2.2.3 2024-08-02
+Multiome 5.6.0 2024-08-02
+PairedTag 1.6.0 2024-08-02
+atac 2.3.0 2024-08-29
 SlideSeq 3.4.0 2024-08-06
 snm3C 4.0.4 2024-08-06
 MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02

From 63f5b1988d79b7d76a198252e6bc913a7d69302e Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 12:58:18 -0400
Subject: [PATCH 14/18] Update README.md

---
 pipelines/skylab/paired_tag/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/paired_tag/README.md b/pipelines/skylab/paired_tag/README.md
index b00f015d6..97a801a49 100644
--- a/pipelines/skylab/paired_tag/README.md
+++ b/pipelines/skylab/paired_tag/README.md
@@ -1,6 +1,6 @@
 ## Announcing a new site for WARP documentation!

-Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)! 
+Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!
 ### Paired-Tag summary

From 0eb8920bd4dd8d204ad7c5dd85be58dc14ce084c Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Thu, 29 Aug 2024 14:06:55 -0400
Subject: [PATCH 15/18] fixed docker version

---
 pipelines/skylab/atac/atac.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index bc4d0c11f..a1b19acdb 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -58,7 +58,7 @@ workflow ATAC {
    String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
    String samtools_docker = "samtools-dist-bwa:3.0.0"
    String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
-    String snap_atac_docker = "snapatac2:lk-PD-2738"
+    String snap_atac_docker = "snapatac2:1.1.0"

    # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
    if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {

From aad3ddcc8dd8124d352780fc83a4643ec3331b38 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Fri, 6 Sep 2024 08:37:10 -0400
Subject: [PATCH 16/18] updating memory on CreateFragment to 64 GB

---
 pipelines/skylab/atac/atac.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index a1b19acdb..b207e393f 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -507,7 +507,7 @@ task CreateFragmentFile {
    File annotations_gtf
    Boolean preindex
    Int disk_size = 500
-    Int mem_size = 16
+    Int mem_size = 64
    Int nthreads = 4
    String cpuPlatform = "Intel Cascade Lake"
    String docker_path

From d8aa1b20768d30866169c5b9fdf06cd15508f625 Mon Sep 17 00:00:00 2001
From: GitHub Action
Date: Fri, 6 Sep 2024 12:37:34 +0000
Subject: [PATCH 17/18] Updated pipeline_versions.txt with all pipeline version information

---
 pipeline_versions.txt | 46 +++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/pipeline_versions.txt b/pipeline_versions.txt
index a01deead2..4d1205506 100644
--- a/pipeline_versions.txt
+++ b/pipeline_versions.txt
@@ -1,42 +1,42 @@
Pipeline Name Version Date of Last Commit
MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02
MultiSampleSmartSeq2 2.2.21 2023-04-19
PairedTag 1.6.0 2024-08-02
Optimus 7.6.0 2024-08-06
atac 2.3.0 2024-08-29
snm3C 4.0.4 2024-08-06
SmartSeq2SingleSample 5.1.20 2023-04-19
Multiome 5.6.0 2024-08-02
scATAC 1.3.2 2023-08-03
BuildIndices 3.0.0 2023-12-06
SlideSeq 3.4.0 2024-08-06
BuildCembaReferences 1.0.0 2020-11-15
CEMBA 1.1.6 2023-12-18
GDCWholeGenomeSomaticSingleSample 1.3.2 2024-08-02
UltimaGenomicsWholeGenomeCramOnly 1.0.20 2024-08-02
JointGenotypingByChromosomePartOne 1.4.12 2023-12-18
JointGenotypingByChromosomePartTwo 1.4.11 2023-12-18
UltimaGenomicsJointGenotyping 1.1.7 2023-12-18
JointGenotyping 1.6.10 2023-12-18
ReblockGVCF 2.2.1 2024-06-12
VariantCalling 2.2.1 2024-06-12
WholeGenomeGermlineSingleSample 3.2.1 2024-06-12
UltimaGenomicsWholeGenomeGermline 1.0.20 2024-08-02
ExomeGermlineSingleSample 3.1.22 2024-06-12
ValidateChip 1.16.5 2024-08-02
Arrays 2.6.27 2024-08-02
Imputation 1.1.13 2024-05-21
MultiSampleArrays 1.6.2 2024-08-02
BroadInternalUltimaGenomics 1.0.21 2024-08-02
BroadInternalArrays 1.1.11 2024-08-02
BroadInternalImputation 1.1.12 2024-08-02
BroadInternalRNAWithUMIs 1.0.33 2024-08-02
CramToUnmappedBams 1.1.3 2024-08-02
ExternalWholeGenomeReprocessing 2.2.2 2024-08-02
ExternalExomeReprocessing 3.2.2 2024-08-02
WholeGenomeReprocessing 3.2.2 2024-08-02
ExomeReprocessing 3.2.2 2024-08-02
IlluminaGenotypingArray 1.12.21 2024-08-02
CheckFingerprint 1.0.20 2024-08-02
AnnotationFiltration 1.2.5 2023-12-18
RNAWithUMIsPipeline 1.0.16 2023-12-18

From 1d67da79fc69cdffc234a55f251dc47a9253c078 Mon Sep 17 00:00:00 2001
From: ekiernan
Date: Fri, 6 Sep 2024 09:00:43 -0400
Subject: [PATCH 18/18] Update atac.changelog.md

---
 pipelines/skylab/atac/atac.changelog.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md
index 1207b32c1..544fb8ea5 100644
--- a/pipelines/skylab/atac/atac.changelog.md
+++ b/pipelines/skylab/atac/atac.changelog.md
@@ -1,7 +1,9 @@
 # 2.3.0
 2024-08-29 (Date of Last Commit)
 
-* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM. 
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
+* Updated the memory for the CreateFragmentFile task to 64 GB
 
 # 2.2.3
 2024-08-02 (Date of Last Commit)
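
PATCH 01's Demultiplexing changes amount to a small filtering routine: read per-adapter (per-cell) counts from the cutadapt stats output, then delete any cell's fastq files whose count falls below `min_threshold` or above `max_threshold`. The standalone sketch below restates that logic in plain Python for readers following the series; the two-column stats format, the `load_adapter_counts` helper, and the derivation of the adapter name from the fastq file name are illustrative assumptions, since the heredoc's parsing code is not fully shown in the patch.

```python
import os
import glob

# Mirror the WDL defaults: Int min_threshold = 100, Int max_threshold = 10000000.
MIN_THRESHOLD = 100
MAX_THRESHOLD = 10000000

def load_adapter_counts(stats_path):
    """Read adapter-name -> read-count pairs from a demultiplexing stats file.
    Assumes a simple two-column layout (name, count); the real cutadapt stats
    format may differ."""
    counts = {}
    with open(stats_path) as fh:
        for line in fh:
            fields = line.split()
            if len(fields) >= 2 and fields[1].isdigit():
                counts[fields[0]] = int(fields[1])
    return counts

def remove_empty_cells(working_dir, stats_path):
    adapter_counts = load_adapter_counts(stats_path)
    for file_path in glob.glob(os.path.join(working_dir, "*-R1.fq.gz")):
        # Assumption: the adapter/cell name is embedded in the fastq file name.
        adapter_name = os.path.basename(file_path).replace("-R1.fq.gz", "")
        if adapter_name in adapter_counts:
            count = adapter_counts[adapter_name]
            if count < MIN_THRESHOLD or count > MAX_THRESHOLD:
                print("Removing", file_path, "with count equal to", count)
                os.remove(file_path)
```

The WDL then guards the downstream batching step: if this filter removes every fastq file, the task exits with an error rather than producing empty batches.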
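The ATAC patches (03 through 09) converge on a single metrics step inside `CreateFragmentFile`: call SnapATAC2's `recipe_10x_metrics`, prepend the NHash ID, flatten one level of nesting, and write a header-less two-column CSV. The sketch below mirrors that final state outside the WDL. It assumes a SnapATAC2 build that provides `pp.recipe_10x_metrics` with the signature used in the patches (v2.7.0 per the changelog); the input paths, chromosome-size dictionary, and NHash ID are placeholders.

```python
import csv
from collections import OrderedDict

import snapatac2.preprocessing as pp

# Placeholder inputs; in the WDL these are interpolated task parameters.
bam = "sample.bam"
fragment_file = "sample.fragments.tsv"
nhash_id = "example_nhash_id"
chrom_sizes = {"chr1": 248956422}  # truncated example dictionary
gtf = "gencode.annotation.gtf"

# recipe_10x_metrics writes the fragment file plus a temporary h5ad and
# returns a (possibly nested) dict of 10x-style library metrics.
data = pp.recipe_10x_metrics(bam, fragment_file, "temp_metrics.h5ad",
                             is_paired=True, barcode_tag="CB",
                             chrom_sizes=chrom_sizes, gene_anno=gtf,
                             peaks=None)

# Prepend the NHash ID, then flatten one level of nesting into
# (metric, value) rows, exactly as the CreateFragmentFile task does.
data = OrderedDict({"NHash_ID": nhash_id, **data})
flattened = []
for category, metrics in data.items():
    if isinstance(metrics, dict):
        flattened.extend(metrics.items())
    else:
        flattened.append((category, metrics))

with open(f"sample_{nhash_id}.atac_metrics.csv", "w", newline="") as fh:
    csv.writer(fh).writerows(flattened)  # header row dropped in PATCH 09
```

The resulting bare two-column CSV is the library-level metrics file that the Multiome and PairedTag READMEs document, with row names matching the table in `library-metrics.md`.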