From 2e32a5aaba8e775fcd512d8b5f18e6c7ba8865dc Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Mon, 26 Feb 2024 00:09:28 -0800 Subject: [PATCH] GRanges map methods consume now, and new `granges map` benchmarks. - `map()` methods in `JoinData` and similar types now consume. - Added new `granges map` vs `bedtools map` benchmarks. - Prettier Python benchmark comparison code --- benches/bedtools_comparison.rs | 177 ++++++++++++++++++++++++++++++++- scripts/benchmark_summary.py | 31 +++++- src/commands.rs | 11 +- src/data/operations.rs | 15 ++- src/join.rs | 30 +++--- src/lib.rs | 7 +- tests/bedtools_validation.rs | 3 +- 7 files changed, 234 insertions(+), 40 deletions(-) diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index bea3a9a..b518d47 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -5,8 +5,13 @@ //! ensure the output is the *exact* same. use criterion::{criterion_group, criterion_main, Criterion}; -use granges::test_utilities::{granges_binary_path, random_bed3file}; -use std::process::Command; +use granges::test_utilities::{ + granges_binary_path, random_bed3file, random_bed5file, temp_bedfile, +}; +use std::{ + fs::File, + process::{Command, Stdio}, +}; const BED_LENGTH: usize = 100_000; @@ -146,8 +151,8 @@ fn bench_flank(c: &mut Criterion) { } fn bench_windows(c: &mut Criterion) { - let width = 100_000; - let step = 1_000; + let width = 1_000_000; + let step = 10_000; // create the benchmark group let mut group = c.benchmark_group("windows"); @@ -187,11 +192,175 @@ fn bench_windows(c: &mut Criterion) { }); } +fn bench_map(c: &mut Criterion) { + let num_ranges = BED_LENGTH; + let width = 100_000; + #[allow(unused_variables)] + let step = 1_000; + + // make windows + let windows_file = temp_bedfile(); + let granges_windows_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + // .arg("--step") + // .arg(step.to_string()) + .arg("--output") + .arg(windows_file.path()) + .output() + .expect("granges windows failed"); + assert!( + granges_windows_output.status.success(), + "{:?}", + granges_windows_output + ); + + let operations = vec!["sum", "min", "max", "mean", "median"]; + + for operation in operations { + // create the random data BED5 + let bedscores_file = random_bed5file(num_ranges); + let bedtools_path = temp_bedfile(); + + // create the benchmark group + let mut group = c.benchmark_group(format!("map_{}", operation).to_string()); + + // configure the sample size for the group + group.sample_size(10); + group.bench_function("bedtools_map", |b| { + b.iter(|| { + let bedtools_output_file = File::create(&bedtools_path).unwrap(); + let bedtools_output = Command::new("bedtools") + .arg("map") + .arg("-a") + .arg(windows_file.path()) + .arg("-b") + .arg(&bedscores_file.path()) + .arg("-c") + .arg("5") + .arg("-o") + .arg(operation) + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools map failed"); + + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_map", |b| { + b.iter(|| { + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("map") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(windows_file.path()) + .arg("--right") + .arg(bedscores_file.path()) + .arg("--func") + .arg(operation) + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges map failed"); + + assert!(granges_output.status.success()); + }); + }); + } +} + +fn bench_map_all_operations(c: &mut Criterion) { + let num_ranges = BED_LENGTH; + let width = 100_000; + #[allow(unused_variables)] + let step = 1_000; + + // make windows + let windows_file = temp_bedfile(); + let granges_windows_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + // .arg("--step") + // .arg(step.to_string()) + .arg("--output") + .arg(windows_file.path()) + .output() + .expect("granges windows failed"); + assert!( + granges_windows_output.status.success(), + "{:?}", + granges_windows_output + ); + + // create the random data BED5 + let bedscores_file = random_bed5file(num_ranges); + let bedtools_path = temp_bedfile(); + + // create the benchmark group + let mut group = c.benchmark_group("map_multiple"); + + // configure the sample size for the group + group.sample_size(10); + group.bench_function("bedtools_map", |b| { + b.iter(|| { + let bedtools_output_file = File::create(&bedtools_path).unwrap(); + let bedtools_output = Command::new("bedtools") + .arg("map") + .arg("-a") + .arg(windows_file.path()) + .arg("-b") + .arg(&bedscores_file.path()) + .arg("-c") + .arg("5") + .arg("-o") + .arg("min,max,mean,sum,median") + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools map failed"); + + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_map", |b| { + b.iter(|| { + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("map") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(windows_file.path()) + .arg("--right") + .arg(bedscores_file.path()) + .arg("--func") + .arg("min,max,mean,sum,median") + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges map failed"); + + assert!(granges_output.status.success()); + }); + }); +} + criterion_group!( benches, bench_filter_adjustment, bench_range_adjustment, bench_flank, bench_windows, + bench_map, + bench_map_all_operations, ); criterion_main!(benches); diff --git a/scripts/benchmark_summary.py b/scripts/benchmark_summary.py index 73f2c23..2838383 100644 --- a/scripts/benchmark_summary.py +++ b/scripts/benchmark_summary.py @@ -1,5 +1,24 @@ import os import json +import tabulate + + +def pretty_time_diff_from_ns(nanoseconds): + units = [ + (1, "ns"), # nanoseconds + (1e3, "Ξs"), # microseconds + (1e6, "ms"), # milliseconds + (1e9, "s"), # seconds + (60e9, "min"), # minutes + (3600e9, "h"), # hours + (86400e9, "d") # days + ] + + for factor, unit in units: + if nanoseconds < factor: + break + time_amount = nanoseconds / (factor / 1000) + return f"{time_amount:.2f} {unit}" def extract_mean_point_estimate(estimates_path): @@ -33,15 +52,19 @@ def calculate_ratios(criterion_dir): comparisons[subcommand] = benches # Calculate and print ratios for each comparison + # print(comparisons) + headers = ["command", "bedtools time", "granges time", "granges speedup (%)"] + table = [] for comparison, tools in comparisons.items(): if len(tools) == 2: bedtools_time, granges_time = tools.values() # Calculate the ratio and convert it to a percentage - percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100 + percent_faster = ((tools["bedtools"] - tools["granges"]) / tools["bedtools"]) * 100 # Format the output to show 3 decimal places - print( - f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools" - ) + # print(f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools") + table.append([comparison, pretty_time_diff_from_ns(tools["bedtools"]), + pretty_time_diff_from_ns(tools["granges"]), percent_faster]) + print(tabulate.tabulate(table, headers=headers)) def main(): diff --git a/src/commands.rs b/src/commands.rs index 2add3d1..03ad443 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -454,13 +454,18 @@ pub fn granges_map( // Process all the overlaps. let result_gr = left_join_gr.map_over_joins(|join_data| { - // Get the "right data" -- the BED5 scores. - let overlap_scores: Vec = join_data.right_data.into_iter().filter_map(|x| x).collect(); + // Get the "right data" -- the BED5 scores + let overlap_scores: Vec = join_data + .right_data + .into_iter() + // Filter out the `None` values. + .flatten() + .collect(); // Run all operations on the scores. operations .iter() - .map(|operation| operation.run(overlap_scores.as_ref())) + .map(|operation| operation.run(&overlap_scores)) .collect::>() })?; diff --git a/src/data/operations.rs b/src/data/operations.rs index 6fefbbc..26e6d8f 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -1,5 +1,10 @@ //! Implementations of various operations on data. //! +//! # 🐌 Performance Note +//! +//! Currently operations are not much faster than bedtools (11%-13%). +//! These methods can be made faster by looping over data once, collecting +//! the quantities that may make up different statistics. use clap::ValueEnum; use num_traits::{Float, ToPrimitive}; @@ -37,20 +42,20 @@ pub enum Operation { } impl Operation { - pub fn run(&self, data: &[T]) -> DatumType + pub fn run(&self, data: &[T]) -> DatumType where T: Float + Sum + ToPrimitive + Clone + ToString, { match self { Operation::Sum => { - let sum: T = data.iter().cloned().sum(); + let sum: T = data.iter().copied().sum(); sum.into_data_type() } Operation::Min => { let min = data .iter() .filter(|x| x.is_finite()) - .cloned() + .copied() .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater)); min.map_or(DatumType::NoValue, |x| x.into_data_type()) } @@ -58,7 +63,7 @@ impl Operation { let max = data .iter() .filter(|x| x.is_finite()) - .cloned() + .copied() .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less)); max.map_or(DatumType::NoValue, |x| x.into_data_type()) } @@ -66,7 +71,7 @@ impl Operation { if data.is_empty() { DatumType::NoValue } else { - let sum: T = data.iter().cloned().sum(); + let sum: T = data.iter().copied().sum(); let mean = sum / T::from(data.len()).unwrap(); mean.into_data_type() } diff --git a/src/join.rs b/src/join.rs index d68cd76..abdcaaf 100644 --- a/src/join.rs +++ b/src/join.rs @@ -147,7 +147,7 @@ where { /// Apply `func` to each element, putting the results into the returned /// `Vec`. - pub fn map(&self, func: F) -> Vec + pub fn map(self, func: F) -> Vec where F: Fn( CombinedJoinData< @@ -157,7 +157,7 @@ where ) -> V, { self.joins - .iter() + .into_iter() .map(|join| { let left_data = self.left_data.get_owned(join.left.unwrap()); let right_data = join @@ -169,7 +169,7 @@ where .collect(); func(CombinedJoinData { - join: join.clone(), + join, left_data, right_data, }) @@ -243,14 +243,14 @@ where { /// Apply `func` to each element, putting the results into the returned /// `Vec`. - pub fn map(&self, func: F) -> Vec + pub fn map(self, func: F) -> Vec where F: Fn(CombinedJoinDataLeftEmpty<::OwnedItem>) -> V, { // TODO/OPTIMIZE: would consuming here (and analagous funcs) be better/faster? // Would require a bit of a redesign. self.joins - .iter() + .into_iter() .map(|join| { let right_indices = join.rights.as_ref(); let right_data = right_indices.map_or(Vec::new(), |indices| { @@ -260,10 +260,7 @@ where .collect() }); - func(CombinedJoinDataLeftEmpty { - join: join.clone(), - right_data, - }) + func(CombinedJoinDataLeftEmpty { join, right_data }) }) .collect() } @@ -334,19 +331,16 @@ where { /// Apply `func` to each element, putting the results into the returned /// `Vec`. - pub fn map(&self, func: F) -> Vec + pub fn map(self, func: F) -> Vec where F: Fn(CombinedJoinDataRightEmpty<
::OwnedItem>) -> V, { self.joins - .iter() + .into_iter() .map(|join| { let left_data = self.left_data.get_owned(join.left.unwrap()); - func(CombinedJoinDataRightEmpty { - join: join.clone(), - left_data, - }) + func(CombinedJoinDataRightEmpty { join, left_data }) }) .collect() } @@ -412,13 +406,13 @@ impl JoinDataOperations<(), ()> for CombinedJoinDataBothEmpty { impl JoinDataBothEmpty { /// Apply `func` to each element, putting the results into the returned /// `Vec`. - pub fn map(&self, func: F) -> Vec + pub fn map(self, func: F) -> Vec where F: Fn(CombinedJoinDataBothEmpty) -> V, { self.joins - .iter() - .map(|join| func(CombinedJoinDataBothEmpty { join: join.clone() })) + .into_iter() + .map(|join| func(CombinedJoinDataBothEmpty { join })) .collect() } } diff --git a/src/lib.rs b/src/lib.rs index f1df199..720730b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,10 +30,6 @@ //! - [Loading Data into GRanges](#loading-data-into-granges) //! - [Sequence types](#sequence-types) //! - [Manipulating GRanges objects](#manipulating-granges-objects) -//! - [Creation](#creation) -//! - [Range-modifying functions](#range-modifying-functions) -//! - [Data-modifying functions](#data-modifying-functions) -//! - [Range and Data modifying functions](#range-and-data-modifying-functions) //! - [Future Development](#future-development) //! - [Current Limitations](#current-limitations) //! - [Contributing](#contributing) @@ -317,6 +313,8 @@ //! 5. Lack of TSV column type inference methods. This would make loading and working with arbitrary //! TSV files with heterogeneous column types easier. //! +//! ⚠ïļ:Note that the GRanges API, as well as type names, may change. +//! //! ### Contributing //! //! If you are interested in contributing to GRanges, please contact Vince Buffalo (@vsbuffalo on @@ -330,6 +328,7 @@ //! - ⚠ïļ: indicates a method or type that is unstable (it may change in future implementations), //! or is "sharp" (developer error could cause a panic). //! - 🚀: optimization tip. +//! - 🐌: indicates slow code in need of future optimization. //! //! [`JoinData`]: crate::join::JoinData //! [`GRanges`]: crate::granges::GRanges diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 13d677c..e74c8b7 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -240,9 +240,8 @@ fn test_against_bedtools_map() { #[allow(unused_variables)] let step = 10_000; // can uncomment lines below to test this - let windows_file = temp_bedfile(); - // make windows + let windows_file = temp_bedfile(); let granges_windows_output = Command::new(granges_binary_path()) .arg("windows") .arg("--genome")