From 3e5c09cd7ef78815c3e8721926ea32139a69fba2 Mon Sep 17 00:00:00 2001
From: Vince Buffalo <vsbuffalo@gmail.com>
Date: Fri, 16 Feb 2024 16:44:58 -0800
Subject: [PATCH] Criterion-based benchmarks of granged adjust vs bedtools slop

 - Added criterion benchmarks against bedtools
 - New enum pattern for loading BED3 vs BED+
 - clippy/fmt issues resolved
---
 Cargo.toml                     |  5 ++-
 benches/bedtools_comparison.rs | 70 ++++++++++++++++++++++++++++++++++
 src/commands.rs                | 26 ++++++++++---
 src/data/vec.rs                |  5 ---
 src/granges.rs                 | 18 ++++-----
 src/io/file.rs                 |  4 +-
 src/join.rs                    |  2 +
 src/lib.rs                     |  6 +--
 src/main/mod.rs                |  6 +--
 src/ranges/mod.rs              | 60 ++++++++++++++---------------
 src/ranges/operations.rs       |  2 +-
 src/reporting.rs               |  1 +
 src/test_utilities.rs          | 18 ++++++++-
 tests/bedtools_validation.rs   | 31 +++++++--------
 14 files changed, 170 insertions(+), 84 deletions(-)
 create mode 100644 benches/bedtools_comparison.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5024788..cd85509 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -37,7 +37,10 @@ name = "granges"
 path = "src/main/mod.rs"
 
 [dev-dependencies]
+criterion = { version = "0.5.1", features = ["html_reports"] }
 tempfile = "3.10.0"
 
-
+[[bench]]
+name = "bedtools_comparison"
+harness = false
 
diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
new file mode 100644
index 0000000..8011fc3
--- /dev/null
+++ b/benches/bedtools_comparison.rs
@@ -0,0 +1,70 @@
+//! Benchmarks comparisons against bedtools.
+//!
+//! For comparison accuracy, any benchmarks conducted here must also have
+//! corresponding integration tests in `tests/bedtools_validation.rs`, to
+//! ensure the output is the *exact* same.
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use granges::{commands::granges_random_bed, test_utilities::granges_binary_path};
+use std::process::Command;
+use tempfile::NamedTempFile;
+
+/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk.
+pub fn random_bedfile() -> NamedTempFile {
+    let temp_file = NamedTempFile::new().expect("Failed to create temp file");
+    let random_bedfile_path = temp_file.path().to_path_buf();
+    granges_random_bed(
+        "tests_data/hg38_seqlens.tsv",
+        100_000,
+        Some(&random_bedfile_path),
+        true,
+    )
+    .expect("could not generate random BED file");
+    temp_file
+}
+
+fn bench_range_adjustment(c: &mut Criterion) {
+    // create the benchmark group
+    let mut group = c.benchmark_group("slop vs adjust");
+
+    // create the test data
+    let input_bedfile = random_bedfile();
+
+    // configure the sample size for the group
+    // group.sample_size(10);
+
+    // bedtools slop
+    group.bench_function("bedtools_slop", |b| {
+        b.iter(|| {
+            let bedtools_output = Command::new("bedtools")
+                .arg("slop")
+                .arg("-g")
+                .arg("tests_data/hg38_seqlens.tsv")
+                .arg("-b")
+                .arg("10")
+                .arg("-i")
+                .arg(input_bedfile.path())
+                .output()
+                .expect("bedtools slop failed");
+            assert!(bedtools_output.status.success());
+        });
+    });
+
+    group.bench_function("granges_adjust", |b| {
+        b.iter(|| {
+            let granges_output = Command::new(granges_binary_path())
+                .arg("adjust")
+                .arg("--seqlens")
+                .arg("tests_data/hg38_seqlens.tsv")
+                .arg("--both")
+                .arg("10")
+                .arg("--sort")
+                .arg(input_bedfile.path())
+                .output()
+                .expect("granges adjust failed");
+            assert!(granges_output.status.success());
+        });
+    });
+}
+criterion_group!(benches, bench_range_adjustment);
+criterion_main!(benches);
diff --git a/src/commands.rs b/src/commands.rs
index 52555b5..170b5a6 100644
--- a/src/commands.rs
+++ b/src/commands.rs
@@ -1,7 +1,13 @@
 use std::path::PathBuf;
 
-use crate::{prelude::*, PositionOffset, reporting::{CommandOutput, Report}, io::OutputFile, ranges::operations::adjust_range, test_utilities::random_granges};
-
+use crate::{
+    io::OutputFile,
+    prelude::*,
+    ranges::operations::adjust_range,
+    reporting::{CommandOutput, Report},
+    test_utilities::random_granges,
+    PositionOffset,
+};
 
 /// Adjust the genomic ranges in a bedfile by some specified amount.
 // NOTE: we don't do build the full GRanges objects here, for efficiency.
@@ -57,8 +63,17 @@ pub fn granges_adjust(
                 // we need to sort, so we build up the appropriate type of GRanges
                 // object, depending on if we need to hold data or not.
                 match gr {
-                    GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end)?,
-                    GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end, range_adjusted.data)?,
+                    GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(
+                        &range_adjusted.seqname,
+                        range_adjusted.start,
+                        range_adjusted.end,
+                    )?,
+                    GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(
+                        &range_adjusted.seqname,
+                        range_adjusted.start,
+                        range_adjusted.end,
+                        range_adjusted.data,
+                    )?,
                 }
             }
         } else {
@@ -79,7 +94,6 @@ pub fn granges_adjust(
             GRangesVariant::Empty(obj) => obj.sort().to_bed3(output)?,
             GRangesVariant::Indexed(obj) => obj.sort().to_tsv(output)?,
         }
-
     }
     Ok(CommandOutput::new((), report))
 }
@@ -90,7 +104,7 @@ pub fn granges_random_bed(
     num: u32,
     output: Option<impl Into<PathBuf>>,
     sort: bool,
-    ) -> Result<CommandOutput<()>, GRangesError> {
+) -> Result<CommandOutput<()>, GRangesError> {
     // get the genome info
     let genome = read_seqlens(seqlens)?;
 
diff --git a/src/data/vec.rs b/src/data/vec.rs
index fd5185d..bd42639 100644
--- a/src/data/vec.rs
+++ b/src/data/vec.rs
@@ -2,7 +2,6 @@
 
 use crate::traits::IndexedDataContainer;
 
-
 /// Trait methods for the commonly-used `Vec<U>` data container.
 ///
 /// Note that the associated `Item` type is always a *reference* to the data elements.
@@ -29,7 +28,3 @@ where
         Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
     }
 }
-
-
-
-
diff --git a/src/granges.rs b/src/granges.rs
index 5f1e6af..c6c12d7 100644
--- a/src/granges.rs
+++ b/src/granges.rs
@@ -12,7 +12,7 @@ use crate::{
         vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
         RangeEmpty, RangeIndexed, RangeRecord,
     },
-    traits::{RangeContainer, RangesIterable, TsvSerialize, GenericRange, IndexedDataContainer},
+    traits::{GenericRange, IndexedDataContainer, RangeContainer, RangesIterable, TsvSerialize},
     Position,
 };
 
@@ -71,11 +71,7 @@ impl<R: GenericRange, T> GRanges<VecRanges<R>, T> {
 
     /// Consume this [`GRanges`] object and sort the ranges.
     pub fn sort(mut self) -> Self {
-        self.ranges
-            .values_mut()
-            .for_each(|ranges| {
-                ranges.sort()
-            });
+        self.ranges.values_mut().for_each(|ranges| ranges.sort());
         self
     }
 }
@@ -106,10 +102,11 @@ impl<U> GRanges<VecRangesIndexed, Vec<U>> {
     }
 }
 
-impl<'a, T> GRanges<VecRanges<RangeIndexed>, T> 
-where T: IndexedDataContainer<'a>,
-      T: TsvSerialize,
-      <T as IndexedDataContainer<'a>>::Item: TsvSerialize
+impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
+where
+    T: IndexedDataContainer<'a>,
+    T: TsvSerialize,
+    <T as IndexedDataContainer<'a>>::Item: TsvSerialize,
 {
     ///
     pub fn to_tsv(&'a self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
@@ -126,7 +123,6 @@ where T: IndexedDataContainer<'a>,
         }
         Ok(())
     }
-
 }
 
 impl GRanges<VecRangesEmpty, ()> {
diff --git a/src/io/file.rs b/src/io/file.rs
index ab8665c..b8c27eb 100644
--- a/src/io/file.rs
+++ b/src/io/file.rs
@@ -93,8 +93,7 @@ impl InputFile {
     }
 
     /// Collects comment lines and/or a line at the start of the file.
-    pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) 
-        -> io::Result<bool> {
+    pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result<bool> {
         let mut buf_reader = self.reader()?;
         let mut comments = Vec::new();
         let mut line = String::new();
@@ -123,7 +122,6 @@ impl InputFile {
         Ok(self.skip_lines > 0)
     }
 
-
     /// Detect the number of columns *from the first line*, according to some delimiter.
     /// This is not robust against ragged delimited data formats.
     pub fn detect_columns(&mut self, delim: &str) -> Result<usize, GRangesError> {
diff --git a/src/join.rs b/src/join.rs
index a381a51..8a4cfe5 100644
--- a/src/join.rs
+++ b/src/join.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::all)]
+
 use std::rc::Rc;
 
 use crate::Position;
diff --git a/src/lib.rs b/src/lib.rs
index d39aa23..d9a6782 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,10 +4,10 @@
 
 pub use indexmap;
 
+pub mod data;
 pub mod error;
 pub mod granges;
 pub mod io;
-pub mod data;
 pub mod iterators;
 pub mod join;
 pub mod ranges;
@@ -30,8 +30,8 @@ pub mod prelude {
 
     pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
     pub use crate::traits::{
-        IndexedDataContainer,
-        GeneralRangeRecordIterator, GenericRange, RangesIntoIterable, RangesIterable, TsvSerialize,
+        GeneralRangeRecordIterator, GenericRange, IndexedDataContainer, RangesIntoIterable,
+        RangesIterable, TsvSerialize,
     };
 
     pub use crate::seqlens;
diff --git a/src/main/mod.rs b/src/main/mod.rs
index 8709632..976f5b0 100644
--- a/src/main/mod.rs
+++ b/src/main/mod.rs
@@ -1,7 +1,7 @@
 use std::path::PathBuf;
 
 use clap::{Parser, Subcommand};
-use granges::{prelude::GRangesError, PositionOffset, commands::{granges_adjust}};
+use granges::{commands::granges_adjust, prelude::GRangesError, PositionOffset};
 
 #[cfg(feature = "dev-commands")]
 use granges::commands::granges_random_bed;
@@ -45,7 +45,6 @@ enum Commands {
         /// sort the ranges after adjusting their start and end positions
         #[arg(long)]
         sort: bool,
-
     },
 
     #[cfg(feature = "dev-commands")]
@@ -59,7 +58,7 @@ enum Commands {
         /// an optional output file (standard output will be used if not specified)
         #[arg(long)]
         output: Option<PathBuf>,
-        /// sort the ranges 
+        /// sort the ranges
         #[arg(long)]
         sort: bool,
     },
@@ -83,7 +82,6 @@ fn run() -> Result<(), GRangesError> {
             sort,
         }) => granges_random_bed(seqlens, *num, output.as_ref(), *sort),
         None => {
-
             println!("{}\n", INFO);
             std::process::exit(1);
         }
diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs
index 5669bb5..9a7f5c2 100644
--- a/src/ranges/mod.rs
+++ b/src/ranges/mod.rs
@@ -4,7 +4,7 @@
 
 use crate::{
     error::GRangesError,
-    traits::{GenericRange, TsvSerialize, IndexedDataContainer},
+    traits::{GenericRange, IndexedDataContainer, TsvSerialize},
     Position,
 };
 
@@ -131,13 +131,8 @@ impl<U: TsvSerialize> TsvSerialize for RangeRecord<Option<U>> {
     fn to_tsv(&self) -> String {
         match &self.data {
             None => {
-                format!(
-                    "{}\t{}\t{}",
-                    self.seqname,
-                    self.start,
-                    self.end,
-                    )
-            }, 
+                format!("{}\t{}\t{}", self.seqname, self.start, self.end,)
+            }
             Some(data) => {
                 format!(
                     "{}\t{}\t{}\t{}",
@@ -145,8 +140,7 @@ impl<U: TsvSerialize> TsvSerialize for RangeRecord<Option<U>> {
                     self.start,
                     self.end,
                     data.to_tsv()
-                    )
-
+                )
             }
         }
     }
@@ -160,7 +154,7 @@ impl<U: TsvSerialize> TsvSerialize for RangeRecord<U> {
             self.start,
             self.end,
             self.data.to_tsv()
-            )
+        )
     }
 }
 
@@ -180,7 +174,7 @@ impl RangeEmptyRecord {
             end,
         }
     }
-    pub fn to_record(self, seqnames: &Vec<String>) -> RangeRecord<()> {
+    pub fn to_record(self, seqnames: &[String]) -> RangeRecord<()> {
         RangeRecord {
             seqname: seqnames[self.seqname_index].clone(),
             start: self.start,
@@ -226,17 +220,21 @@ impl RangeIndexedRecord {
             index,
         }
     }
-    pub fn to_record<'a, T>(self, seqnames: &Vec<String>, data: &'a T) 
-        -> RangeRecord<<T as IndexedDataContainer<'a>>::Item> 
-        where T: IndexedDataContainer<'a> + TsvSerialize
-        {
-            RangeRecord {
-                seqname: seqnames[self.seqname_index].clone(),
-                start: self.start,
-                end: self.end,
-                data: data.get_value(self.index),
-            }
+    pub fn to_record<'a, T>(
+        self,
+        seqnames: &[String],
+        data: &'a T,
+    ) -> RangeRecord<<T as IndexedDataContainer<'a>>::Item>
+    where
+        T: IndexedDataContainer<'a> + TsvSerialize,
+    {
+        RangeRecord {
+            seqname: seqnames[self.seqname_index].clone(),
+            start: self.start,
+            end: self.end,
+            data: data.get_value(self.index),
         }
+    }
 }
 
 impl GenericRange for RangeIndexedRecord {
@@ -271,15 +269,15 @@ pub fn validate_range(
     start: Position,
     end: Position,
     length: Position,
-    ) -> Result<(), GRangesError> {
+) -> Result<(), GRangesError> {
     if start > end {
         return Err(GRangesError::InvalidGenomicRange(start, end));
     }
 
     if end >= length {
         return Err(GRangesError::InvalidGenomicRangeForSequence(
-                start, end, length,
-                ));
+            start, end, length,
+        ));
     }
     Ok(())
 }
@@ -293,9 +291,9 @@ mod tests {
     fn test_invalid_range_start_end() {
         let result = validate_range(5, 1, 10);
         assert!(matches!(
-                result,
-                Err(GRangesError::InvalidGenomicRange(5, 1))
-                ));
+            result,
+            Err(GRangesError::InvalidGenomicRange(5, 1))
+        ));
     }
 
     #[test]
@@ -308,8 +306,8 @@ mod tests {
     fn test_invalid_range_length() {
         let result = validate_range(1, 10, 10);
         assert!(matches!(
-                result,
-                Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10))
-                ));
+            result,
+            Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10))
+        ));
     }
 }
diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs
index 00dc4e9..9559872 100644
--- a/src/ranges/operations.rs
+++ b/src/ranges/operations.rs
@@ -34,8 +34,8 @@ pub fn adjust_range<R: GenericRange>(
 
 #[cfg(test)]
 mod tests {
-    use crate::ranges::RangeIndexed;
     use super::*;
+    use crate::ranges::RangeIndexed;
 
     #[test]
     fn test_normal_adjustment() {
diff --git a/src/reporting.rs b/src/reporting.rs
index 3ac2c30..6f2ef02 100644
--- a/src/reporting.rs
+++ b/src/reporting.rs
@@ -1,6 +1,7 @@
 //! Centralized reporting to the user about e.g. fragile operations.
 //!
 
+#[allow(unused)]
 pub struct CommandOutput<U> {
     value: U,
     report: Report,
diff --git a/src/test_utilities.rs b/src/test_utilities.rs
index f1f0d63..057a60c 100644
--- a/src/test_utilities.rs
+++ b/src/test_utilities.rs
@@ -1,6 +1,8 @@
 //! Test cases and test utility functions.
 //!
 
+use std::path::PathBuf;
+
 use crate::{
     create_granges_with_seqlens,
     error::GRangesError,
@@ -15,6 +17,20 @@ use crate::{
 use indexmap::IndexMap;
 use rand::{seq::SliceRandom, thread_rng, Rng};
 
+/// Get the path to the `grange` command line tool after a build.
+/// This is used for integration tests and benchmarks.
+pub fn granges_binary_path() -> PathBuf {
+    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    path.push("target");
+    path.push(if cfg!(debug_assertions) {
+        "debug"
+    } else {
+        "release"
+    });
+    path.push("granges");
+    path
+}
+
 // Stochastic test ranges defaults
 //
 // This is the random number of range to use in tests.
@@ -82,7 +98,7 @@ pub fn random_granges(
             .get(seqname)
             .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?;
         let (start, end) = random_range(chrom_len);
-        gr.push_range_empty(&seqname, start, end)?;
+        gr.push_range_empty(seqname, start, end)?;
     }
     Ok(gr)
 }
diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs
index 01965a7..11ff373 100644
--- a/tests/bedtools_validation.rs
+++ b/tests/bedtools_validation.rs
@@ -1,26 +1,20 @@
 //! Validation against bedtools
 
-use std::{process::Command, path::PathBuf};
-use granges::commands::granges_random_bed;
+use granges::{commands::granges_random_bed, test_utilities::granges_binary_path};
+use std::process::Command;
 use tempfile::NamedTempFile;
 
-
-fn granges_binary_path() -> PathBuf {
-    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    path.push("target");
-    path.push(if cfg!(debug_assertions) { "debug" } else { "release" });
-    path.push("granges");
-    path
-}
-
 #[test]
 fn test_against_bedtools_slop() {
-
     let random_bedfile = NamedTempFile::new().expect("Failed to create temp file");
     let random_bedfile_path = random_bedfile.path().to_path_buf();
-    granges_random_bed("tests_data/hg38_seqlens.tsv", 100_000, 
-                       Some(&random_bedfile_path), true)
-        .expect("could not generate random BED file");
+    granges_random_bed(
+        "tests_data/hg38_seqlens.tsv",
+        100_000,
+        Some(&random_bedfile_path),
+        true,
+    )
+    .expect("could not generate random BED file");
 
     let width = 10;
 
@@ -49,7 +43,8 @@ fn test_against_bedtools_slop() {
     assert!(bedtools_output.status.success());
     assert!(granges_output.status.success());
 
-    assert_eq!(String::from_utf8_lossy(&bedtools_output.stdout), 
-               String::from_utf8_lossy(&granges_output.stdout));
+    assert_eq!(
+        String::from_utf8_lossy(&bedtools_output.stdout),
+        String::from_utf8_lossy(&granges_output.stdout)
+    );
 }
-