GRanges map methods consume now, and new granges map benchmarks.

- `map()` methods in `JoinData` and similar types now consume. - Added new `granges map` vs `bedtools map` benchmarks. - Prettier Python benchmark comparison code
vsbuffalo · Feb 26, 2024 · 2e32a5a · 2e32a5a
1 parent b780fa4
commit 2e32a5a
Show file tree

Hide file tree

Showing 7 changed files with 234 additions and 40 deletions.
diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
@@ -5,8 +5,13 @@
 //! ensure the output is the *exact* same.
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use granges::test_utilities::{granges_binary_path, random_bed3file};
-use std::process::Command;
+use granges::test_utilities::{
+ granges_binary_path, random_bed3file, random_bed5file, temp_bedfile,
+};
+use std::{
+ fs::File,
+ process::{Command, Stdio},
+};
 
 const BED_LENGTH: usize = 100_000;
 
@@ -146,8 +151,8 @@ fn bench_flank(c: &mut Criterion) {
 }
 
 fn bench_windows(c: &mut Criterion) {
- let width = 100_000;
- let step = 1_000;
+ let width = 1_000_000;
+ let step = 10_000;
 
  // create the benchmark group
  let mut group = c.benchmark_group("windows");
@@ -187,11 +192,175 @@ fn bench_windows(c: &mut Criterion) {
  });
 }
 
+fn bench_map(c: &mut Criterion) {
+ let num_ranges = BED_LENGTH;
+ let width = 100_000;
+ #[allow(unused_variables)]
+ let step = 1_000;
+
+ // make windows
+ let windows_file = temp_bedfile();
+ let granges_windows_output = Command::new(granges_binary_path())
+ .arg("windows")
+ .arg("--genome")
+ .arg("tests_data/hg38_seqlens.tsv")
+ .arg("--width")
+ .arg(width.to_string())
+ // .arg("--step")
+ // .arg(step.to_string())
+ .arg("--output")
+ .arg(windows_file.path())
+ .output()
+ .expect("granges windows failed");
+ assert!(
+ granges_windows_output.status.success(),
+ "{:?}",
+ granges_windows_output
+ );
+
+ let operations = vec!["sum", "min", "max", "mean", "median"];
+
+ for operation in operations {
+ // create the random data BED5
+ let bedscores_file = random_bed5file(num_ranges);
+ let bedtools_path = temp_bedfile();
+
+ // create the benchmark group
+ let mut group = c.benchmark_group(format!("map_{}", operation).to_string());
+
+ // configure the sample size for the group
+ group.sample_size(10);
+ group.bench_function("bedtools_map", |b| {
+ b.iter(|| {
+ let bedtools_output_file = File::create(&bedtools_path).unwrap();
+ let bedtools_output = Command::new("bedtools")
+ .arg("map")
+ .arg("-a")
+ .arg(windows_file.path())
+ .arg("-b")
+ .arg(&bedscores_file.path())
+ .arg("-c")
+ .arg("5")
+ .arg("-o")
+ .arg(operation)
+ .stdout(Stdio::from(bedtools_output_file))
+ .output()
+ .expect("bedtools map failed");
+
+ assert!(bedtools_output.status.success());
+ });
+ });
+
+ group.bench_function("granges_map", |b| {
+ b.iter(|| {
+ let granges_output_file = temp_bedfile();
+ let granges_output = Command::new(granges_binary_path())
+ .arg("map")
+ .arg("--genome")
+ .arg("tests_data/hg38_seqlens.tsv")
+ .arg("--left")
+ .arg(windows_file.path())
+ .arg("--right")
+ .arg(bedscores_file.path())
+ .arg("--func")
+ .arg(operation)
+ .arg("--output")
+ .arg(granges_output_file.path())
+ .output()
+ .expect("granges map failed");
+
+ assert!(granges_output.status.success());
+ });
+ });
+ }
+}
+
+fn bench_map_all_operations(c: &mut Criterion) {
+ let num_ranges = BED_LENGTH;
+ let width = 100_000;
+ #[allow(unused_variables)]
+ let step = 1_000;
+
+ // make windows
+ let windows_file = temp_bedfile();
+ let granges_windows_output = Command::new(granges_binary_path())
+ .arg("windows")
+ .arg("--genome")
+ .arg("tests_data/hg38_seqlens.tsv")
+ .arg("--width")
+ .arg(width.to_string())
+ // .arg("--step")
+ // .arg(step.to_string())
+ .arg("--output")
+ .arg(windows_file.path())
+ .output()
+ .expect("granges windows failed");
+ assert!(
+ granges_windows_output.status.success(),
+ "{:?}",
+ granges_windows_output
+ );
+
+ // create the random data BED5
+ let bedscores_file = random_bed5file(num_ranges);
+ let bedtools_path = temp_bedfile();
+
+ // create the benchmark group
+ let mut group = c.benchmark_group("map_multiple");
+
+ // configure the sample size for the group
+ group.sample_size(10);
+ group.bench_function("bedtools_map", |b| {
+ b.iter(|| {
+ let bedtools_output_file = File::create(&bedtools_path).unwrap();
+ let bedtools_output = Command::new("bedtools")
+ .arg("map")
+ .arg("-a")
+ .arg(windows_file.path())
+ .arg("-b")
+ .arg(&bedscores_file.path())
+ .arg("-c")
+ .arg("5")
+ .arg("-o")
+ .arg("min,max,mean,sum,median")
+ .stdout(Stdio::from(bedtools_output_file))
+ .output()
+ .expect("bedtools map failed");
+
+ assert!(bedtools_output.status.success());
+ });
+ });
+
+ group.bench_function("granges_map", |b| {
+ b.iter(|| {
+ let granges_output_file = temp_bedfile();
+ let granges_output = Command::new(granges_binary_path())
+ .arg("map")
+ .arg("--genome")
+ .arg("tests_data/hg38_seqlens.tsv")
+ .arg("--left")
+ .arg(windows_file.path())
+ .arg("--right")
+ .arg(bedscores_file.path())
+ .arg("--func")
+ .arg("min,max,mean,sum,median")
+ .arg("--output")
+ .arg(granges_output_file.path())
+ .output()
+ .expect("granges map failed");
+
+ assert!(granges_output.status.success());
+ });
+ });
+}
+
 criterion_group!(
  benches,
  bench_filter_adjustment,
  bench_range_adjustment,
  bench_flank,
  bench_windows,
+ bench_map,
+ bench_map_all_operations,
 );
 criterion_main!(benches);
diff --git a/scripts/benchmark_summary.py b/scripts/benchmark_summary.py
@@ -1,5 +1,24 @@
 import os
 import json
+import tabulate
+
+
+def pretty_time_diff_from_ns(nanoseconds):
+ units = [
+ (1, "ns"), # nanoseconds
+ (1e3, "μs"), # microseconds
+ (1e6, "ms"), # milliseconds
+ (1e9, "s"), # seconds
+ (60e9, "min"), # minutes
+ (3600e9, "h"), # hours
+ (86400e9, "d") # days
+ ]
+
+ for factor, unit in units:
+ if nanoseconds < factor:
+ break
+ time_amount = nanoseconds / (factor / 1000)
+ return f"{time_amount:.2f} {unit}"
 
 
 def extract_mean_point_estimate(estimates_path):
@@ -33,15 +52,19 @@ def calculate_ratios(criterion_dir):
  comparisons[subcommand] = benches
 
  # Calculate and print ratios for each comparison
+ # print(comparisons)
+ headers = ["command", "bedtools time", "granges time", "granges speedup (%)"]
+ table = []
  for comparison, tools in comparisons.items():
  if len(tools) == 2:
  bedtools_time, granges_time = tools.values()
  # Calculate the ratio and convert it to a percentage
- percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100
+ percent_faster = ((tools["bedtools"] - tools["granges"]) / tools["bedtools"]) * 100
  # Format the output to show 3 decimal places
- print(
- f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools"
- )
+ # print(f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools")
+ table.append([comparison, pretty_time_diff_from_ns(tools["bedtools"]), 
+ pretty_time_diff_from_ns(tools["granges"]), percent_faster])
+ print(tabulate.tabulate(table, headers=headers))
 
 
 def main():

diff --git a/src/commands.rs b/src/commands.rs
@@ -454,13 +454,18 @@ pub fn granges_map(
 
  // Process all the overlaps.
  let result_gr = left_join_gr.map_over_joins(|join_data| {
- // Get the "right data" -- the BED5 scores.
- let overlap_scores: Vec<f64> = join_data.right_data.into_iter().filter_map(|x| x).collect();
+ // Get the "right data" -- the BED5 scores
+ let overlap_scores: Vec<f64> = join_data
+ .right_data
+ .into_iter()
+ // Filter out the `None` values.
+ .flatten()
+ .collect();
 
  // Run all operations on the scores.
  operations
  .iter()
- .map(|operation| operation.run(overlap_scores.as_ref()))
+ .map(|operation| operation.run(&overlap_scores))
  .collect::<Vec<DatumType>>()
  })?;
 

diff --git a/src/data/operations.rs b/src/data/operations.rs
@@ -1,5 +1,10 @@
 //! Implementations of various operations on data.
 //!
+//! # 🐌 Performance Note
+//!
+//! Currently operations are not much faster than bedtools (11%-13%).
+//! These methods can be made faster by looping over data once, collecting
+//! the quantities that may make up different statistics.
 
 use clap::ValueEnum;
 use num_traits::{Float, ToPrimitive};
@@ -37,36 +42,36 @@ pub enum Operation {
 }
 
 impl Operation {
- pub fn run<T: IntoDatumType>(&self, data: &[T]) -> DatumType
+ pub fn run<T: IntoDatumType + Copy>(&self, data: &[T]) -> DatumType
  where
  T: Float + Sum<T> + ToPrimitive + Clone + ToString,
  {
  match self {
  Operation::Sum => {
- let sum: T = data.iter().cloned().sum();
+ let sum: T = data.iter().copied().sum();
  sum.into_data_type()
  }
  Operation::Min => {
  let min = data
  .iter()
  .filter(|x| x.is_finite())
- .cloned()
+ .copied()
  .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater));
  min.map_or(DatumType::NoValue, |x| x.into_data_type())
  }
  Operation::Max => {
  let max = data
  .iter()
  .filter(|x| x.is_finite())
- .cloned()
+ .copied()
  .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less));
  max.map_or(DatumType::NoValue, |x| x.into_data_type())
  }
  Operation::Mean => {
  if data.is_empty() {
  DatumType::NoValue
  } else {
- let sum: T = data.iter().cloned().sum();
+ let sum: T = data.iter().copied().sum();
  let mean = sum / T::from(data.len()).unwrap();
  mean.into_data_type()
  }