New additional benchmarks for preprint.

- This adds some Snakemake/Python-based benchmarks for clearer comparison to other tools. - Also changed `unwrap()` to `expect()`.
vsbuffalo · May 28, 2024 · e5b19eb · e5b19eb
1 parent 39217cd
commit e5b19eb
Show file tree

Hide file tree

Showing 8 changed files with 351 additions and 1 deletion.
diff --git a/additional_benchmarks/README.md b/additional_benchmarks/README.md
@@ -0,0 +1,7 @@
+# Additional Benchmarks
+
+## Environment
+
+ mamba create -n granges_benchmark -c conda-forge -c bioconda --file requirements.txt --yes
+ pip install snakemake-executor-plugin-slurm
+
diff --git a/additional_benchmarks/Snakefile b/additional_benchmarks/Snakefile
@@ -0,0 +1,221 @@
+import numpy as np
+
+RSCRIPT = "~/.conda/envs/granges_benchmark/bin/Rscript"
+GRANGES = "../target/release/granges"
+SEQLENS = "../tests_data/hg38_seqlens.tsv"
+NREPS = 100
+
+localrules: granges, windows
+
+rule granges:
+ output: GRANGES
+ shell:
+ """
+ (cargo build --release --features=dev-commands)
+ """
+
+## ----- random BED files -----
+
+rule random_bed:
+ input: seqlens=SEQLENS, granges=GRANGES
+ output: "random_bed/{size,[0-9]+}__{rep}.bed.gz"
+ resources:
+ runtime=30,
+ mem_mb_per_cpu=1800,
+ cpus_per_task=28
+ shell:
+ """
+ {input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output}
+ """
+
+rule random_bed_scores:
+ input: seqlens=SEQLENS, granges=GRANGES
+ output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz"
+ resources:
+ runtime=30,
+ mem_mb_per_cpu=1800,
+ cpus_per_task=28
+ shell:
+ """
+ {input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output}
+ """
+
+rule windows:
+ input: seqlens=SEQLENS, granges=GRANGES
+ output: "windows/width_{width}.bed"
+ shell:
+ """
+ {input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output}
+ """
+
+
+## ----- granges filter / bedtools intersect / plyranges join overlap inner -----
+
+rule granges_filter:
+ input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz", 
+ genome=SEQLENS, granges=GRANGES
+ output: "results/granges_filter__size_{size}.bed"
+ benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=600,
+ mem_mb=96000,
+ cpus_per_task=40
+ shell:
+ """
+ {input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output}
+ """
+
+rule bedtools_intersect:
+ input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz"
+ output: "results/bedtools_intersect__size_{size}.bed"
+ benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=600,
+ mem_mb=96000,
+ cpus_per_task=40
+ shell:
+ """
+ bedtools intersect -a {input.a} -b {input.b} > {output}
+ """
+
+rule plyranges_join_overlap_inner:
+ input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz"
+ output: "results/plyranges_join_overlap_inner__size_{size}.bed"
+ params: rscript = RSCRIPT
+ benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=600,
+ mem_mb=96000,
+ cpus_per_task=40
+ shell:
+ """
+ {params.rscript} rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} {output}
+ """
+
+## ----- granges map / bedtools map -----
+WINDOWS = "windows/width_100000.bed"
+
+rule granges_map:
+ input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz",
+ genome=SEQLENS, granges=GRANGES
+ output: "results/granges_map__size_{size}.bed"
+ benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=600,
+ mem_mb=96000,
+ cpus_per_task=40
+ shell:
+ """
+ {input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output}
+ """
+
+rule bedtools_map:
+ input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz"
+ output: "results/bedtools_map__size_{size}.bed"
+ benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=600,
+ mem_mb=96000,
+ cpus_per_task=40
+ shell:
+ """
+ bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output}
+ """
+
+rule plyranges_map:
+ input:
+ left=WINDOWS,
+ right="random_bed/scores/{size}__A.bed.gz"
+ output:
+ "results/plyranges_map__size_{size}.bed"
+ benchmark:
+ repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS)
+ resources:
+ partition="savio3",
+ runtime=1000,
+ mem_mb=96000,
+ cpus_per_task=40
+ params: rscript = RSCRIPT
+ shell:
+ """
+ {params.rscript} rscripts/plyranges_map.r {input.left} {input.right} {output}
+ """
+
+
+tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"]
+tools += ["bedtools_map", "granges_map", "plyranges_map"]
+
+# BED file sizes
+sizes = np.logspace(3, 7, 10).astype('int')
+
+all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes)
+
+rule all:
+ input: all_results
+
+
+rule combine:
+ input: expand("benchmarks/{tool}__size_{size}.tsv", tool=tools, size=sizes)
+ output: "combined_benchmarks.tsv"
+ run:
+ import pandas as pd
+
+ def read_tsv(file):
+ return pd.read_csv(file, sep="\t")
+
+ dfs = []
+ for tool in tools:
+ for size in sizes:
+ file = f"benchmarks/{tool}__size_{size}.tsv"
+ df = read_tsv(file)
+ df["tool"] = tool
+ df["size"] = size
+ dfs.append(df)
+
+ combined_df = pd.concat(dfs, ignore_index=True)
+ combined_df.to_csv(output[0], sep="\t", index=False)
+
+rule figure:
+ input: "combined_benchmarks.tsv"
+ output: "benchmark_figure.pdf"
+ run:
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import pandas as pd
+ import numpy as np
+ mpl.use('agg')
+
+ d = pd.read_csv(input[0], sep="\t")
+ ds = d.groupby(['tool', 'size'], as_index=False)['s'].median()
+ ds = ds.loc[ds['size'] > 1e3]
+
+ tools = d['tool'].unique()
+ num_tools = len(tools)
+ colors = plt.cm.Paired.colors
+
+ fig, ax = plt.subplots()
+
+ i = 0
+ for tool in ds['tool'].unique():
+ data = ds[ds['tool'] == tool]
+ tool, cmd = tool.split('_')[:2]
+
+ color = colors[i]
+ i += 1
+ ax.plot(data['size'], data['s'], '-o', label=f"{tool} {cmd}", color=color,)
+
+ ax.semilogx()
+ ax.semilogy()
+ ax.set_xlabel('Number of BED ranges')
+ ax.set_ylabel('Time (s)')
+ ax.legend(fontsize=8, frameon=False)
+ plt.tight_layout()
+ plt.savefig(output[0])
+
+rule merge:
+ input: "benchmark_figure.pdf"
diff --git a/additional_benchmarks/requirements.txt b/additional_benchmarks/requirements.txt
@@ -0,0 +1,6 @@
+snakemake
+numpy
+scipy
+matplotlib
+bedtools
+R
diff --git a/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r b/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r
@@ -0,0 +1,18 @@
+library(plyranges)
+
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) {
+ stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>")
+}
+
+bed_file_a <- args[1]
+bed_file_b <- args[2]
+output_file <- args[3]
+
+# load data into mem
+granges_a <- read_bed(bed_file_a)
+granges_b <- read_bed(bed_file_b)
+
+intersection_result <- join_overlap_inner(granges_a, granges_b)
+
+write_bed(intersection_result, output_file)
diff --git a/additional_benchmarks/rscripts/plyranges_map.r b/additional_benchmarks/rscripts/plyranges_map.r
@@ -0,0 +1,31 @@
+library(plyranges)
+library(rtracklayer)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) != 3) {
+ stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>")
+}
+
+bed_file_a <- args[1]
+bed_file_b <- args[2]
+output_file <- args[3]
+
+# Load data into memory
+granges_a <- read_bed(bed_file_a)
+granges_b <- read_bed(bed_file_b)
+
+# Perform the map operation
+mapped_granges <- granges_a %>%
+ join_overlap_inner(granges_b) %>%
+ group_by(seqnames, start, end, strand) %>%
+ summarise(
+ min = min(score, na.rm = TRUE),
+ max = max(score, na.rm = TRUE),
+ mean = mean(score, na.rm = TRUE),
+ sum = sum(score, na.rm = TRUE),
+ median = median(score, na.rm = TRUE)
+ )
+
+mapped_df <- as.data.frame(mapped_granges)
+write.table(mapped_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE)
diff --git a/examples/weighted_mean.rs b/examples/weighted_mean.rs
@@ -0,0 +1,66 @@
+use granges::prelude::*;
+
+// Our overlap data processing function - a weighted mean.
+pub fn weighted_mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 {
+ // Get the widths of overlaps.
+ let overlap_widths = join_data.join().overlap_widths();
+
+ // Get the "right data" -- the BED5 scores.
+ let overlap_scores: Vec<Option<f64>> = join_data
+ .right_data
+ .into_iter()
+ .collect();
+
+ // Calculate the weighted mean
+ let mut score_sum = 0.0;
+ let mut total_width = 0.0;
+ for (score, width) in overlap_scores.iter().zip(overlap_widths) {
+ if let Some(value) = score {
+ score_sum += value * (width as f64);
+ total_width += width as f64;
+ } else {
+ // This is a missing value, a '.' in BED.
+ continue;
+ }
+ }
+ if total_width > 0.0 {
+ score_sum / total_width
+ } else {
+ 0.0
+ }
+}
+
+fn try_main() -> Result<(), granges::error::GRangesError> {
+ // Load in the chromosome lengths.
+ let genome = read_seqlens("tests_data/hg38_seqlens.tsv")?;
+
+ // Create 1Mb windows, non-sliding, chopping off 
+ // remainder.
+ let width = 1_000_000;
+ let windows = GRangesEmpty::from_windows(&genome, width, None, true)?;
+
+ // Create a new iterator over a BED5 file.
+ let scores_iter = Bed5Iterator::new("tests_data/scores.bed.gz")?;
+
+ // Load the ranges from the iterator into a GRanges object.
+ let scores = GRanges::from_iter(scores_iter, &genome)?
+ // Convert to interval trees.
+ .into_coitrees()?
+ // Extract out just the score from the additional 
+ // BED5 columns.
+ .map_data(|bed5_cols| bed5_cols.score)?;
+
+ windows
+ // Do a "left overlap join" with scores.
+ .left_overlaps(&scores)?
+ // Process each overlapping join by applying
+ // a weighted mean function.
+ .map_joins(weighted_mean_score)?
+ // Write to a BED file.
+ .write_to_tsv(Some("tests_data/weighted_means.bed"), &BED_TSV)?;
+ Ok(())
+}
+
+fn main() {
+ try_main().unwrap();
+}
diff --git a/src/granges.rs b/src/granges.rs
@@ -491,7 +491,8 @@ where
  for (seqname, indices) in self.data_indices()? {
  let mut new_data = Vec::new();
  for index in indices {
- let value = data.get(index).unwrap();
+ let value = data.get(index)
+ .expect("Invalid index in GRanges::data_by_seqname(). This indicates invalid data indices (developer error).");
  new_data.push((*value).clone());
  }
  all_new_data.insert(&seqname, new_data)?;

diff --git a/tests_data/scores.bed.gz b/tests_data/scores.bed.gz