From 3e5c09cd7ef78815c3e8721926ea32139a69fba2 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Fri, 16 Feb 2024 16:44:58 -0800 Subject: [PATCH] Criterion-based benchmarks of granged adjust vs bedtools slop - Added criterion benchmarks against bedtools - New enum pattern for loading BED3 vs BED+ - clippy/fmt issues resolved --- Cargo.toml | 5 ++- benches/bedtools_comparison.rs | 70 ++++++++++++++++++++++++++++++++++ src/commands.rs | 26 ++++++++++--- src/data/vec.rs | 5 --- src/granges.rs | 18 ++++----- src/io/file.rs | 4 +- src/join.rs | 2 + src/lib.rs | 6 +-- src/main/mod.rs | 6 +-- src/ranges/mod.rs | 60 ++++++++++++++--------------- src/ranges/operations.rs | 2 +- src/reporting.rs | 1 + src/test_utilities.rs | 18 ++++++++- tests/bedtools_validation.rs | 31 +++++++-------- 14 files changed, 170 insertions(+), 84 deletions(-) create mode 100644 benches/bedtools_comparison.rs diff --git a/Cargo.toml b/Cargo.toml index 5024788..cd85509 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,10 @@ name = "granges" path = "src/main/mod.rs" [dev-dependencies] +criterion = { version = "0.5.1", features = ["html_reports"] } tempfile = "3.10.0" - +[[bench]] +name = "bedtools_comparison" +harness = false diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs new file mode 100644 index 0000000..8011fc3 --- /dev/null +++ b/benches/bedtools_comparison.rs @@ -0,0 +1,70 @@ +//! Benchmarks comparisons against bedtools. +//! +//! For comparison accuracy, any benchmarks conducted here must also have +//! corresponding integration tests in `tests/bedtools_validation.rs`, to +//! ensure the output is the *exact* same. + +use criterion::{criterion_group, criterion_main, Criterion}; +use granges::{commands::granges_random_bed, test_utilities::granges_binary_path}; +use std::process::Command; +use tempfile::NamedTempFile; + +/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. +pub fn random_bedfile() -> NamedTempFile { + let temp_file = NamedTempFile::new().expect("Failed to create temp file"); + let random_bedfile_path = temp_file.path().to_path_buf(); + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + 100_000, + Some(&random_bedfile_path), + true, + ) + .expect("could not generate random BED file"); + temp_file +} + +fn bench_range_adjustment(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("slop vs adjust"); + + // create the test data + let input_bedfile = random_bedfile(); + + // configure the sample size for the group + // group.sample_size(10); + + // bedtools slop + group.bench_function("bedtools_slop", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("slop") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-b") + .arg("10") + .arg("-i") + .arg(input_bedfile.path()) + .output() + .expect("bedtools slop failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_adjust", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("adjust") + .arg("--seqlens") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--both") + .arg("10") + .arg("--sort") + .arg(input_bedfile.path()) + .output() + .expect("granges adjust failed"); + assert!(granges_output.status.success()); + }); + }); +} +criterion_group!(benches, bench_range_adjustment); +criterion_main!(benches); diff --git a/src/commands.rs b/src/commands.rs index 52555b5..170b5a6 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,7 +1,13 @@ use std::path::PathBuf; -use crate::{prelude::*, PositionOffset, reporting::{CommandOutput, Report}, io::OutputFile, ranges::operations::adjust_range, test_utilities::random_granges}; - +use crate::{ + io::OutputFile, + prelude::*, + ranges::operations::adjust_range, + reporting::{CommandOutput, Report}, + test_utilities::random_granges, + PositionOffset, +}; /// Adjust the genomic ranges in a bedfile by some specified amount. // NOTE: we don't do build the full GRanges objects here, for efficiency. @@ -57,8 +63,17 @@ pub fn granges_adjust( // we need to sort, so we build up the appropriate type of GRanges // object, depending on if we need to hold data or not. match gr { - GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end)?, - GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end, range_adjusted.data)?, + GRangesVariant::Empty(ref mut obj) => obj.push_range_empty( + &range_adjusted.seqname, + range_adjusted.start, + range_adjusted.end, + )?, + GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data( + &range_adjusted.seqname, + range_adjusted.start, + range_adjusted.end, + range_adjusted.data, + )?, } } } else { @@ -79,7 +94,6 @@ pub fn granges_adjust( GRangesVariant::Empty(obj) => obj.sort().to_bed3(output)?, GRangesVariant::Indexed(obj) => obj.sort().to_tsv(output)?, } - } Ok(CommandOutput::new((), report)) } @@ -90,7 +104,7 @@ pub fn granges_random_bed( num: u32, output: Option>, sort: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/data/vec.rs b/src/data/vec.rs index fd5185d..bd42639 100644 --- a/src/data/vec.rs +++ b/src/data/vec.rs @@ -2,7 +2,6 @@ use crate::traits::IndexedDataContainer; - /// Trait methods for the commonly-used `Vec` data container. /// /// Note that the associated `Item` type is always a *reference* to the data elements. @@ -29,7 +28,3 @@ where Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) } } - - - - diff --git a/src/granges.rs b/src/granges.rs index 5f1e6af..c6c12d7 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -12,7 +12,7 @@ use crate::{ vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, RangeEmpty, RangeIndexed, RangeRecord, }, - traits::{RangeContainer, RangesIterable, TsvSerialize, GenericRange, IndexedDataContainer}, + traits::{GenericRange, IndexedDataContainer, RangeContainer, RangesIterable, TsvSerialize}, Position, }; @@ -71,11 +71,7 @@ impl GRanges, T> { /// Consume this [`GRanges`] object and sort the ranges. pub fn sort(mut self) -> Self { - self.ranges - .values_mut() - .for_each(|ranges| { - ranges.sort() - }); + self.ranges.values_mut().for_each(|ranges| ranges.sort()); self } } @@ -106,10 +102,11 @@ impl GRanges> { } } -impl<'a, T> GRanges, T> -where T: IndexedDataContainer<'a>, - T: TsvSerialize, - >::Item: TsvSerialize +impl<'a, T> GRanges, T> +where + T: IndexedDataContainer<'a>, + T: TsvSerialize, + >::Item: TsvSerialize, { /// pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { @@ -126,7 +123,6 @@ where T: IndexedDataContainer<'a>, } Ok(()) } - } impl GRanges { diff --git a/src/io/file.rs b/src/io/file.rs index ab8665c..b8c27eb 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -93,8 +93,7 @@ impl InputFile { } /// Collects comment lines and/or a line at the start of the file. - pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) - -> io::Result { + pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result { let mut buf_reader = self.reader()?; let mut comments = Vec::new(); let mut line = String::new(); @@ -123,7 +122,6 @@ impl InputFile { Ok(self.skip_lines > 0) } - /// Detect the number of columns *from the first line*, according to some delimiter. /// This is not robust against ragged delimited data formats. pub fn detect_columns(&mut self, delim: &str) -> Result { diff --git a/src/join.rs b/src/join.rs index a381a51..8a4cfe5 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,3 +1,5 @@ +#![allow(clippy::all)] + use std::rc::Rc; use crate::Position; diff --git a/src/lib.rs b/src/lib.rs index d39aa23..d9a6782 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,10 +4,10 @@ pub use indexmap; +pub mod data; pub mod error; pub mod granges; pub mod io; -pub mod data; pub mod iterators; pub mod join; pub mod ranges; @@ -30,8 +30,8 @@ pub mod prelude { pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ - IndexedDataContainer, - GeneralRangeRecordIterator, GenericRange, RangesIntoIterable, RangesIterable, TsvSerialize, + GeneralRangeRecordIterator, GenericRange, IndexedDataContainer, RangesIntoIterable, + RangesIterable, TsvSerialize, }; pub use crate::seqlens; diff --git a/src/main/mod.rs b/src/main/mod.rs index 8709632..976f5b0 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; -use granges::{prelude::GRangesError, PositionOffset, commands::{granges_adjust}}; +use granges::{commands::granges_adjust, prelude::GRangesError, PositionOffset}; #[cfg(feature = "dev-commands")] use granges::commands::granges_random_bed; @@ -45,7 +45,6 @@ enum Commands { /// sort the ranges after adjusting their start and end positions #[arg(long)] sort: bool, - }, #[cfg(feature = "dev-commands")] @@ -59,7 +58,7 @@ enum Commands { /// an optional output file (standard output will be used if not specified) #[arg(long)] output: Option, - /// sort the ranges + /// sort the ranges #[arg(long)] sort: bool, }, @@ -83,7 +82,6 @@ fn run() -> Result<(), GRangesError> { sort, }) => granges_random_bed(seqlens, *num, output.as_ref(), *sort), None => { - println!("{}\n", INFO); std::process::exit(1); } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 5669bb5..9a7f5c2 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -4,7 +4,7 @@ use crate::{ error::GRangesError, - traits::{GenericRange, TsvSerialize, IndexedDataContainer}, + traits::{GenericRange, IndexedDataContainer, TsvSerialize}, Position, }; @@ -131,13 +131,8 @@ impl TsvSerialize for RangeRecord> { fn to_tsv(&self) -> String { match &self.data { None => { - format!( - "{}\t{}\t{}", - self.seqname, - self.start, - self.end, - ) - }, + format!("{}\t{}\t{}", self.seqname, self.start, self.end,) + } Some(data) => { format!( "{}\t{}\t{}\t{}", @@ -145,8 +140,7 @@ impl TsvSerialize for RangeRecord> { self.start, self.end, data.to_tsv() - ) - + ) } } } @@ -160,7 +154,7 @@ impl TsvSerialize for RangeRecord { self.start, self.end, self.data.to_tsv() - ) + ) } } @@ -180,7 +174,7 @@ impl RangeEmptyRecord { end, } } - pub fn to_record(self, seqnames: &Vec) -> RangeRecord<()> { + pub fn to_record(self, seqnames: &[String]) -> RangeRecord<()> { RangeRecord { seqname: seqnames[self.seqname_index].clone(), start: self.start, @@ -226,17 +220,21 @@ impl RangeIndexedRecord { index, } } - pub fn to_record<'a, T>(self, seqnames: &Vec, data: &'a T) - -> RangeRecord<>::Item> - where T: IndexedDataContainer<'a> + TsvSerialize - { - RangeRecord { - seqname: seqnames[self.seqname_index].clone(), - start: self.start, - end: self.end, - data: data.get_value(self.index), - } + pub fn to_record<'a, T>( + self, + seqnames: &[String], + data: &'a T, + ) -> RangeRecord<>::Item> + where + T: IndexedDataContainer<'a> + TsvSerialize, + { + RangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data: data.get_value(self.index), } + } } impl GenericRange for RangeIndexedRecord { @@ -271,15 +269,15 @@ pub fn validate_range( start: Position, end: Position, length: Position, - ) -> Result<(), GRangesError> { +) -> Result<(), GRangesError> { if start > end { return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { return Err(GRangesError::InvalidGenomicRangeForSequence( - start, end, length, - )); + start, end, length, + )); } Ok(()) } @@ -293,9 +291,9 @@ mod tests { fn test_invalid_range_start_end() { let result = validate_range(5, 1, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRange(5, 1)) - )); + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); } #[test] @@ -308,8 +306,8 @@ mod tests { fn test_invalid_range_length() { let result = validate_range(1, 10, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) - )); + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } } diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index 00dc4e9..9559872 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -34,8 +34,8 @@ pub fn adjust_range( #[cfg(test)] mod tests { - use crate::ranges::RangeIndexed; use super::*; + use crate::ranges::RangeIndexed; #[test] fn test_normal_adjustment() { diff --git a/src/reporting.rs b/src/reporting.rs index 3ac2c30..6f2ef02 100644 --- a/src/reporting.rs +++ b/src/reporting.rs @@ -1,6 +1,7 @@ //! Centralized reporting to the user about e.g. fragile operations. //! +#[allow(unused)] pub struct CommandOutput { value: U, report: Report, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index f1f0d63..057a60c 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,6 +1,8 @@ //! Test cases and test utility functions. //! +use std::path::PathBuf; + use crate::{ create_granges_with_seqlens, error::GRangesError, @@ -15,6 +17,20 @@ use crate::{ use indexmap::IndexMap; use rand::{seq::SliceRandom, thread_rng, Rng}; +/// Get the path to the `grange` command line tool after a build. +/// This is used for integration tests and benchmarks. +pub fn granges_binary_path() -> PathBuf { + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + path.push("target"); + path.push(if cfg!(debug_assertions) { + "debug" + } else { + "release" + }); + path.push("granges"); + path +} + // Stochastic test ranges defaults // // This is the random number of range to use in tests. @@ -82,7 +98,7 @@ pub fn random_granges( .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; let (start, end) = random_range(chrom_len); - gr.push_range_empty(&seqname, start, end)?; + gr.push_range_empty(seqname, start, end)?; } Ok(gr) } diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 01965a7..11ff373 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -1,26 +1,20 @@ //! Validation against bedtools -use std::{process::Command, path::PathBuf}; -use granges::commands::granges_random_bed; +use granges::{commands::granges_random_bed, test_utilities::granges_binary_path}; +use std::process::Command; use tempfile::NamedTempFile; - -fn granges_binary_path() -> PathBuf { - let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - path.push("target"); - path.push(if cfg!(debug_assertions) { "debug" } else { "release" }); - path.push("granges"); - path -} - #[test] fn test_against_bedtools_slop() { - let random_bedfile = NamedTempFile::new().expect("Failed to create temp file"); let random_bedfile_path = random_bedfile.path().to_path_buf(); - granges_random_bed("tests_data/hg38_seqlens.tsv", 100_000, - Some(&random_bedfile_path), true) - .expect("could not generate random BED file"); + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + 100_000, + Some(&random_bedfile_path), + true, + ) + .expect("could not generate random BED file"); let width = 10; @@ -49,7 +43,8 @@ fn test_against_bedtools_slop() { assert!(bedtools_output.status.success()); assert!(granges_output.status.success()); - assert_eq!(String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout)); + assert_eq!( + String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout) + ); } -