From 2e32a5aaba8e775fcd512d8b5f18e6c7ba8865dc Mon Sep 17 00:00:00 2001
From: Vince Buffalo <vsbuffalo@gmail.com>
Date: Mon, 26 Feb 2024 00:09:28 -0800
Subject: [PATCH] GRanges map methods consume now, and new `granges map`
 benchmarks.

 - `map()` methods in `JoinData` and similar types now consume.
 - Added new `granges map` vs `bedtools map` benchmarks.
 - Prettier Python benchmark comparison code
---
 benches/bedtools_comparison.rs | 177 ++++++++++++++++++++++++++++++++-
 scripts/benchmark_summary.py   |  31 +++++-
 src/commands.rs                |  11 +-
 src/data/operations.rs         |  15 ++-
 src/join.rs                    |  30 +++---
 src/lib.rs                     |   7 +-
 tests/bedtools_validation.rs   |   3 +-
 7 files changed, 234 insertions(+), 40 deletions(-)

diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
index bea3a9a..b518d47 100644
--- a/benches/bedtools_comparison.rs
+++ b/benches/bedtools_comparison.rs
@@ -5,8 +5,13 @@
 //! ensure the output is the *exact* same.
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use granges::test_utilities::{granges_binary_path, random_bed3file};
-use std::process::Command;
+use granges::test_utilities::{
+    granges_binary_path, random_bed3file, random_bed5file, temp_bedfile,
+};
+use std::{
+    fs::File,
+    process::{Command, Stdio},
+};
 
 const BED_LENGTH: usize = 100_000;
 
@@ -146,8 +151,8 @@ fn bench_flank(c: &mut Criterion) {
 }
 
 fn bench_windows(c: &mut Criterion) {
-    let width = 100_000;
-    let step = 1_000;
+    let width = 1_000_000;
+    let step = 10_000;
 
     // create the benchmark group
     let mut group = c.benchmark_group("windows");
@@ -187,11 +192,175 @@ fn bench_windows(c: &mut Criterion) {
     });
 }
 
+fn bench_map(c: &mut Criterion) {
+    let num_ranges = BED_LENGTH;
+    let width = 100_000;
+    #[allow(unused_variables)]
+    let step = 1_000;
+
+    // make windows
+    let windows_file = temp_bedfile();
+    let granges_windows_output = Command::new(granges_binary_path())
+        .arg("windows")
+        .arg("--genome")
+        .arg("tests_data/hg38_seqlens.tsv")
+        .arg("--width")
+        .arg(width.to_string())
+        // .arg("--step")
+        // .arg(step.to_string())
+        .arg("--output")
+        .arg(windows_file.path())
+        .output()
+        .expect("granges windows failed");
+    assert!(
+        granges_windows_output.status.success(),
+        "{:?}",
+        granges_windows_output
+    );
+
+    let operations = vec!["sum", "min", "max", "mean", "median"];
+
+    for operation in operations {
+        // create the random data BED5
+        let bedscores_file = random_bed5file(num_ranges);
+        let bedtools_path = temp_bedfile();
+
+        // create the benchmark group
+        let mut group = c.benchmark_group(format!("map_{}", operation).to_string());
+
+        // configure the sample size for the group
+        group.sample_size(10);
+        group.bench_function("bedtools_map", |b| {
+            b.iter(|| {
+                let bedtools_output_file = File::create(&bedtools_path).unwrap();
+                let bedtools_output = Command::new("bedtools")
+                    .arg("map")
+                    .arg("-a")
+                    .arg(windows_file.path())
+                    .arg("-b")
+                    .arg(&bedscores_file.path())
+                    .arg("-c")
+                    .arg("5")
+                    .arg("-o")
+                    .arg(operation)
+                    .stdout(Stdio::from(bedtools_output_file))
+                    .output()
+                    .expect("bedtools map failed");
+
+                assert!(bedtools_output.status.success());
+            });
+        });
+
+        group.bench_function("granges_map", |b| {
+            b.iter(|| {
+                let granges_output_file = temp_bedfile();
+                let granges_output = Command::new(granges_binary_path())
+                    .arg("map")
+                    .arg("--genome")
+                    .arg("tests_data/hg38_seqlens.tsv")
+                    .arg("--left")
+                    .arg(windows_file.path())
+                    .arg("--right")
+                    .arg(bedscores_file.path())
+                    .arg("--func")
+                    .arg(operation)
+                    .arg("--output")
+                    .arg(granges_output_file.path())
+                    .output()
+                    .expect("granges map failed");
+
+                assert!(granges_output.status.success());
+            });
+        });
+    }
+}
+
+fn bench_map_all_operations(c: &mut Criterion) {
+    let num_ranges = BED_LENGTH;
+    let width = 100_000;
+    #[allow(unused_variables)]
+    let step = 1_000;
+
+    // make windows
+    let windows_file = temp_bedfile();
+    let granges_windows_output = Command::new(granges_binary_path())
+        .arg("windows")
+        .arg("--genome")
+        .arg("tests_data/hg38_seqlens.tsv")
+        .arg("--width")
+        .arg(width.to_string())
+        // .arg("--step")
+        // .arg(step.to_string())
+        .arg("--output")
+        .arg(windows_file.path())
+        .output()
+        .expect("granges windows failed");
+    assert!(
+        granges_windows_output.status.success(),
+        "{:?}",
+        granges_windows_output
+    );
+
+    // create the random data BED5
+    let bedscores_file = random_bed5file(num_ranges);
+    let bedtools_path = temp_bedfile();
+
+    // create the benchmark group
+    let mut group = c.benchmark_group("map_multiple");
+
+    // configure the sample size for the group
+    group.sample_size(10);
+    group.bench_function("bedtools_map", |b| {
+        b.iter(|| {
+            let bedtools_output_file = File::create(&bedtools_path).unwrap();
+            let bedtools_output = Command::new("bedtools")
+                .arg("map")
+                .arg("-a")
+                .arg(windows_file.path())
+                .arg("-b")
+                .arg(&bedscores_file.path())
+                .arg("-c")
+                .arg("5")
+                .arg("-o")
+                .arg("min,max,mean,sum,median")
+                .stdout(Stdio::from(bedtools_output_file))
+                .output()
+                .expect("bedtools map failed");
+
+            assert!(bedtools_output.status.success());
+        });
+    });
+
+    group.bench_function("granges_map", |b| {
+        b.iter(|| {
+            let granges_output_file = temp_bedfile();
+            let granges_output = Command::new(granges_binary_path())
+                .arg("map")
+                .arg("--genome")
+                .arg("tests_data/hg38_seqlens.tsv")
+                .arg("--left")
+                .arg(windows_file.path())
+                .arg("--right")
+                .arg(bedscores_file.path())
+                .arg("--func")
+                .arg("min,max,mean,sum,median")
+                .arg("--output")
+                .arg(granges_output_file.path())
+                .output()
+                .expect("granges map failed");
+
+            assert!(granges_output.status.success());
+        });
+    });
+}
+
 criterion_group!(
     benches,
     bench_filter_adjustment,
     bench_range_adjustment,
     bench_flank,
     bench_windows,
+    bench_map,
+    bench_map_all_operations,
 );
 criterion_main!(benches);
diff --git a/scripts/benchmark_summary.py b/scripts/benchmark_summary.py
index 73f2c23..2838383 100644
--- a/scripts/benchmark_summary.py
+++ b/scripts/benchmark_summary.py
@@ -1,5 +1,24 @@
 import os
 import json
+import tabulate
+
+
+def pretty_time_diff_from_ns(nanoseconds):
+    units = [
+        (1, "ns"),  # nanoseconds
+        (1e3, "μs"),  # microseconds
+        (1e6, "ms"),  # milliseconds
+        (1e9, "s"),  # seconds
+        (60e9, "min"),  # minutes
+        (3600e9, "h"),  # hours
+        (86400e9, "d")  # days
+    ]
+
+    for factor, unit in units:
+        if nanoseconds < factor:
+            break
+    time_amount = nanoseconds / (factor / 1000)
+    return f"{time_amount:.2f} {unit}"
 
 
 def extract_mean_point_estimate(estimates_path):
@@ -33,15 +52,19 @@ def calculate_ratios(criterion_dir):
             comparisons[subcommand] = benches
 
     # Calculate and print ratios for each comparison
+    # print(comparisons)
+    headers = ["command", "bedtools time", "granges time", "granges speedup (%)"]
+    table = []
     for comparison, tools in comparisons.items():
         if len(tools) == 2:
             bedtools_time, granges_time = tools.values()
             # Calculate the ratio and convert it to a percentage
-            percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100
+            percent_faster = ((tools["bedtools"] - tools["granges"]) / tools["bedtools"]) * 100
             # Format the output to show 3 decimal places
-            print(
-                f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools"
-            )
+            # print(f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools")
+            table.append([comparison, pretty_time_diff_from_ns(tools["bedtools"]), 
+                          pretty_time_diff_from_ns(tools["granges"]), percent_faster])
+    print(tabulate.tabulate(table, headers=headers))
 
 
 def main():
diff --git a/src/commands.rs b/src/commands.rs
index 2add3d1..03ad443 100644
--- a/src/commands.rs
+++ b/src/commands.rs
@@ -454,13 +454,18 @@ pub fn granges_map(
 
     // Process all the overlaps.
     let result_gr = left_join_gr.map_over_joins(|join_data| {
-        // Get the "right data" -- the BED5 scores.
-        let overlap_scores: Vec<f64> = join_data.right_data.into_iter().filter_map(|x| x).collect();
+        // Get the "right data" -- the BED5 scores
+        let overlap_scores: Vec<f64> = join_data
+            .right_data
+            .into_iter()
+            // Filter out the `None` values.
+            .flatten()
+            .collect();
 
         // Run all operations on the scores.
         operations
             .iter()
-            .map(|operation| operation.run(overlap_scores.as_ref()))
+            .map(|operation| operation.run(&overlap_scores))
             .collect::<Vec<DatumType>>()
     })?;
 
diff --git a/src/data/operations.rs b/src/data/operations.rs
index 6fefbbc..26e6d8f 100644
--- a/src/data/operations.rs
+++ b/src/data/operations.rs
@@ -1,5 +1,10 @@
 //! Implementations of various operations on data.
 //!
+//! # 🐌 Performance Note
+//!
+//! Currently operations are not much faster than bedtools (11%-13%).
+//! These methods can be made faster by looping over data once, collecting
+//! the quantities that may make up different statistics.
 
 use clap::ValueEnum;
 use num_traits::{Float, ToPrimitive};
@@ -37,20 +42,20 @@ pub enum Operation {
 }
 
 impl Operation {
-    pub fn run<T: IntoDatumType>(&self, data: &[T]) -> DatumType
+    pub fn run<T: IntoDatumType + Copy>(&self, data: &[T]) -> DatumType
     where
         T: Float + Sum<T> + ToPrimitive + Clone + ToString,
     {
         match self {
             Operation::Sum => {
-                let sum: T = data.iter().cloned().sum();
+                let sum: T = data.iter().copied().sum();
                 sum.into_data_type()
             }
             Operation::Min => {
                 let min = data
                     .iter()
                     .filter(|x| x.is_finite())
-                    .cloned()
+                    .copied()
                     .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater));
                 min.map_or(DatumType::NoValue, |x| x.into_data_type())
             }
@@ -58,7 +63,7 @@ impl Operation {
                 let max = data
                     .iter()
                     .filter(|x| x.is_finite())
-                    .cloned()
+                    .copied()
                     .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less));
                 max.map_or(DatumType::NoValue, |x| x.into_data_type())
             }
@@ -66,7 +71,7 @@ impl Operation {
                 if data.is_empty() {
                     DatumType::NoValue
                 } else {
-                    let sum: T = data.iter().cloned().sum();
+                    let sum: T = data.iter().copied().sum();
                     let mean = sum / T::from(data.len()).unwrap();
                     mean.into_data_type()
                 }
diff --git a/src/join.rs b/src/join.rs
index d68cd76..abdcaaf 100644
--- a/src/join.rs
+++ b/src/join.rs
@@ -147,7 +147,7 @@ where
 {
     /// Apply `func` to each element, putting the results into the returned
     /// `Vec<U>`.
-    pub fn map<F, V>(&self, func: F) -> Vec<V>
+    pub fn map<F, V>(self, func: F) -> Vec<V>
     where
         F: Fn(
             CombinedJoinData<
@@ -157,7 +157,7 @@ where
         ) -> V,
     {
         self.joins
-            .iter()
+            .into_iter()
             .map(|join| {
                 let left_data = self.left_data.get_owned(join.left.unwrap());
                 let right_data = join
@@ -169,7 +169,7 @@ where
                     .collect();
 
                 func(CombinedJoinData {
-                    join: join.clone(),
+                    join,
                     left_data,
                     right_data,
                 })
@@ -243,14 +243,14 @@ where
 {
     /// Apply `func` to each element, putting the results into the returned
     /// `Vec<U>`.
-    pub fn map<F, V>(&self, func: F) -> Vec<V>
+    pub fn map<F, V>(self, func: F) -> Vec<V>
     where
         F: Fn(CombinedJoinDataLeftEmpty<<DR as IndexedDataContainer>::OwnedItem>) -> V,
     {
         // TODO/OPTIMIZE: would consuming here (and analagous funcs) be better/faster?
         // Would require a bit of a redesign.
         self.joins
-            .iter()
+            .into_iter()
             .map(|join| {
                 let right_indices = join.rights.as_ref();
                 let right_data = right_indices.map_or(Vec::new(), |indices| {
@@ -260,10 +260,7 @@ where
                         .collect()
                 });
 
-                func(CombinedJoinDataLeftEmpty {
-                    join: join.clone(),
-                    right_data,
-                })
+                func(CombinedJoinDataLeftEmpty { join, right_data })
             })
             .collect()
     }
@@ -334,19 +331,16 @@ where
 {
     /// Apply `func` to each element, putting the results into the returned
     /// `Vec<U>`.
-    pub fn map<F, V>(&self, func: F) -> Vec<V>
+    pub fn map<F, V>(self, func: F) -> Vec<V>
     where
         F: Fn(CombinedJoinDataRightEmpty<<DL as IndexedDataContainer>::OwnedItem>) -> V,
     {
         self.joins
-            .iter()
+            .into_iter()
             .map(|join| {
                 let left_data = self.left_data.get_owned(join.left.unwrap());
 
-                func(CombinedJoinDataRightEmpty {
-                    join: join.clone(),
-                    left_data,
-                })
+                func(CombinedJoinDataRightEmpty { join, left_data })
             })
             .collect()
     }
@@ -412,13 +406,13 @@ impl JoinDataOperations<(), ()> for CombinedJoinDataBothEmpty {
 impl JoinDataBothEmpty {
     /// Apply `func` to each element, putting the results into the returned
     /// `Vec<U>`.
-    pub fn map<F, V>(&self, func: F) -> Vec<V>
+    pub fn map<F, V>(self, func: F) -> Vec<V>
     where
         F: Fn(CombinedJoinDataBothEmpty) -> V,
     {
         self.joins
-            .iter()
-            .map(|join| func(CombinedJoinDataBothEmpty { join: join.clone() }))
+            .into_iter()
+            .map(|join| func(CombinedJoinDataBothEmpty { join }))
             .collect()
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index f1df199..720730b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,10 +30,6 @@
 //! - [Loading Data into GRanges](#loading-data-into-granges)
 //! - [Sequence types](#sequence-types)
 //! - [Manipulating GRanges objects](#manipulating-granges-objects)
-//!   - [Creation](#creation)
-//!   - [Range-modifying functions](#range-modifying-functions)
-//!   - [Data-modifying functions](#data-modifying-functions)
-//!   - [Range and Data modifying functions](#range-and-data-modifying-functions)
 //! - [Future Development](#future-development)
 //!   - [Current Limitations](#current-limitations)
 //!   - [Contributing](#contributing)
@@ -317,6 +313,8 @@
 //!  5. Lack of TSV column type inference methods. This would make loading and working with arbitrary
 //!     TSV files with heterogeneous column types easier.
 //!
+//!  ⚠️:Note that the GRanges API, as well as type names, may change.
+//!
 //! ### Contributing
 //!
 //! If you are interested in contributing to GRanges, please contact Vince Buffalo (@vsbuffalo on
@@ -330,6 +328,7 @@
 //!  - ⚠️: indicates a method or type that is unstable (it may change in future implementations),
 //!     or is "sharp" (developer error could cause a panic).
 //!  - 🚀: optimization tip.
+//!  - 🐌: indicates slow code in need of future optimization.
 //!
 //! [`JoinData`]: crate::join::JoinData
 //! [`GRanges`]: crate::granges::GRanges
diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs
index 13d677c..e74c8b7 100644
--- a/tests/bedtools_validation.rs
+++ b/tests/bedtools_validation.rs
@@ -240,9 +240,8 @@ fn test_against_bedtools_map() {
     #[allow(unused_variables)]
     let step = 10_000; // can uncomment lines below to test this
 
-    let windows_file = temp_bedfile();
-
     // make windows
+    let windows_file = temp_bedfile();
     let granges_windows_output = Command::new(granges_binary_path())
         .arg("windows")
         .arg("--genome")