Skip to content

Commit

Permalink
GRanges map methods consume now, and new granges map benchmarks.
Browse files Browse the repository at this point in the history
 - `map()` methods in `JoinData` and similar types now consume.
 - Added new `granges map` vs `bedtools map` benchmarks.
 - Prettier Python benchmark comparison code
  • Loading branch information
vsbuffalo committed Feb 26, 2024
1 parent b780fa4 commit 2e32a5a
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 40 deletions.
177 changes: 173 additions & 4 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@
//! ensure the output is the *exact* same.

use criterion::{criterion_group, criterion_main, Criterion};
use granges::test_utilities::{granges_binary_path, random_bed3file};
use std::process::Command;
use granges::test_utilities::{
granges_binary_path, random_bed3file, random_bed5file, temp_bedfile,
};
use std::{
fs::File,
process::{Command, Stdio},
};

const BED_LENGTH: usize = 100_000;

Expand Down Expand Up @@ -146,8 +151,8 @@ fn bench_flank(c: &mut Criterion) {
}

fn bench_windows(c: &mut Criterion) {
let width = 100_000;
let step = 1_000;
let width = 1_000_000;
let step = 10_000;

// create the benchmark group
let mut group = c.benchmark_group("windows");
Expand Down Expand Up @@ -187,11 +192,175 @@ fn bench_windows(c: &mut Criterion) {
});
}

fn bench_map(c: &mut Criterion) {
let num_ranges = BED_LENGTH;
let width = 100_000;
#[allow(unused_variables)]
let step = 1_000;

// make windows
let windows_file = temp_bedfile();
let granges_windows_output = Command::new(granges_binary_path())
.arg("windows")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--width")
.arg(width.to_string())
// .arg("--step")
// .arg(step.to_string())
.arg("--output")
.arg(windows_file.path())
.output()
.expect("granges windows failed");
assert!(
granges_windows_output.status.success(),
"{:?}",
granges_windows_output
);

let operations = vec!["sum", "min", "max", "mean", "median"];

for operation in operations {
// create the random data BED5
let bedscores_file = random_bed5file(num_ranges);
let bedtools_path = temp_bedfile();

// create the benchmark group
let mut group = c.benchmark_group(format!("map_{}", operation).to_string());

// configure the sample size for the group
group.sample_size(10);
group.bench_function("bedtools_map", |b| {
b.iter(|| {
let bedtools_output_file = File::create(&bedtools_path).unwrap();
let bedtools_output = Command::new("bedtools")
.arg("map")
.arg("-a")
.arg(windows_file.path())
.arg("-b")
.arg(&bedscores_file.path())
.arg("-c")
.arg("5")
.arg("-o")
.arg(operation)
.stdout(Stdio::from(bedtools_output_file))
.output()
.expect("bedtools map failed");

assert!(bedtools_output.status.success());
});
});

group.bench_function("granges_map", |b| {
b.iter(|| {
let granges_output_file = temp_bedfile();
let granges_output = Command::new(granges_binary_path())
.arg("map")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--left")
.arg(windows_file.path())
.arg("--right")
.arg(bedscores_file.path())
.arg("--func")
.arg(operation)
.arg("--output")
.arg(granges_output_file.path())
.output()
.expect("granges map failed");

assert!(granges_output.status.success());
});
});
}
}

fn bench_map_all_operations(c: &mut Criterion) {
let num_ranges = BED_LENGTH;
let width = 100_000;
#[allow(unused_variables)]
let step = 1_000;

// make windows
let windows_file = temp_bedfile();
let granges_windows_output = Command::new(granges_binary_path())
.arg("windows")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--width")
.arg(width.to_string())
// .arg("--step")
// .arg(step.to_string())
.arg("--output")
.arg(windows_file.path())
.output()
.expect("granges windows failed");
assert!(
granges_windows_output.status.success(),
"{:?}",
granges_windows_output
);

// create the random data BED5
let bedscores_file = random_bed5file(num_ranges);
let bedtools_path = temp_bedfile();

// create the benchmark group
let mut group = c.benchmark_group("map_multiple");

// configure the sample size for the group
group.sample_size(10);
group.bench_function("bedtools_map", |b| {
b.iter(|| {
let bedtools_output_file = File::create(&bedtools_path).unwrap();
let bedtools_output = Command::new("bedtools")
.arg("map")
.arg("-a")
.arg(windows_file.path())
.arg("-b")
.arg(&bedscores_file.path())
.arg("-c")
.arg("5")
.arg("-o")
.arg("min,max,mean,sum,median")
.stdout(Stdio::from(bedtools_output_file))
.output()
.expect("bedtools map failed");

assert!(bedtools_output.status.success());
});
});

group.bench_function("granges_map", |b| {
b.iter(|| {
let granges_output_file = temp_bedfile();
let granges_output = Command::new(granges_binary_path())
.arg("map")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--left")
.arg(windows_file.path())
.arg("--right")
.arg(bedscores_file.path())
.arg("--func")
.arg("min,max,mean,sum,median")
.arg("--output")
.arg(granges_output_file.path())
.output()
.expect("granges map failed");

assert!(granges_output.status.success());
});
});
}

criterion_group!(
benches,
bench_filter_adjustment,
bench_range_adjustment,
bench_flank,
bench_windows,
bench_map,
bench_map_all_operations,
);
criterion_main!(benches);
31 changes: 27 additions & 4 deletions scripts/benchmark_summary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
import os
import json
import tabulate


def pretty_time_diff_from_ns(nanoseconds):
units = [
(1, "ns"), # nanoseconds
(1e3, "μs"), # microseconds
(1e6, "ms"), # milliseconds
(1e9, "s"), # seconds
(60e9, "min"), # minutes
(3600e9, "h"), # hours
(86400e9, "d") # days
]

for factor, unit in units:
if nanoseconds < factor:
break
time_amount = nanoseconds / (factor / 1000)
return f"{time_amount:.2f} {unit}"


def extract_mean_point_estimate(estimates_path):
Expand Down Expand Up @@ -33,15 +52,19 @@ def calculate_ratios(criterion_dir):
comparisons[subcommand] = benches

# Calculate and print ratios for each comparison
# print(comparisons)
headers = ["command", "bedtools time", "granges time", "granges speedup (%)"]
table = []
for comparison, tools in comparisons.items():
if len(tools) == 2:
bedtools_time, granges_time = tools.values()
# Calculate the ratio and convert it to a percentage
percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100
percent_faster = ((tools["bedtools"] - tools["granges"]) / tools["bedtools"]) * 100
# Format the output to show 3 decimal places
print(
f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools"
)
# print(f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools")
table.append([comparison, pretty_time_diff_from_ns(tools["bedtools"]),
pretty_time_diff_from_ns(tools["granges"]), percent_faster])
print(tabulate.tabulate(table, headers=headers))


def main():
Expand Down
11 changes: 8 additions & 3 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -454,13 +454,18 @@ pub fn granges_map(

// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
// Get the "right data" -- the BED5 scores.
let overlap_scores: Vec<f64> = join_data.right_data.into_iter().filter_map(|x| x).collect();
// Get the "right data" -- the BED5 scores
let overlap_scores: Vec<f64> = join_data
.right_data
.into_iter()
// Filter out the `None` values.
.flatten()
.collect();

// Run all operations on the scores.
operations
.iter()
.map(|operation| operation.run(overlap_scores.as_ref()))
.map(|operation| operation.run(&overlap_scores))
.collect::<Vec<DatumType>>()
})?;

Expand Down
15 changes: 10 additions & 5 deletions src/data/operations.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
//! Implementations of various operations on data.
//!
//! # 🐌 Performance Note
//!
//! Currently operations are not much faster than bedtools (11%-13%).
//! These methods can be made faster by looping over data once, collecting
//! the quantities that may make up different statistics.

use clap::ValueEnum;
use num_traits::{Float, ToPrimitive};
Expand Down Expand Up @@ -37,36 +42,36 @@ pub enum Operation {
}

impl Operation {
pub fn run<T: IntoDatumType>(&self, data: &[T]) -> DatumType
pub fn run<T: IntoDatumType + Copy>(&self, data: &[T]) -> DatumType
where
T: Float + Sum<T> + ToPrimitive + Clone + ToString,
{
match self {
Operation::Sum => {
let sum: T = data.iter().cloned().sum();
let sum: T = data.iter().copied().sum();
sum.into_data_type()
}
Operation::Min => {
let min = data
.iter()
.filter(|x| x.is_finite())
.cloned()
.copied()
.min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater));
min.map_or(DatumType::NoValue, |x| x.into_data_type())
}
Operation::Max => {
let max = data
.iter()
.filter(|x| x.is_finite())
.cloned()
.copied()
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less));
max.map_or(DatumType::NoValue, |x| x.into_data_type())
}
Operation::Mean => {
if data.is_empty() {
DatumType::NoValue
} else {
let sum: T = data.iter().cloned().sum();
let sum: T = data.iter().copied().sum();
let mean = sum / T::from(data.len()).unwrap();
mean.into_data_type()
}
Expand Down
Loading

0 comments on commit 2e32a5a

Please sign in to comment.