From e0b9c3b57cc04847358161beb93316dfc374b403 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Mon, 4 Mar 2024 23:46:23 -0800 Subject: [PATCH] Redesign of how joins are stored in `LeftGroupedJoin`. - Previously only overlap and data indices were stored in `LeftGroupedJoin`. However, this makes downstream operations that need to compare, merge/flatten, etc. the right ranges harder. So, now a `Vec` is stored. `RangeTuple` is a new minimal range type with `Option` index (they fit well in `LeftGroupedJoin`, since this option avoids having to carry indexed/empty types to this level, which prevents API clog). - `RangeTuple` and functionality added. - New experimental method `hist-feature` `--reduce` option which forces overlapping right ranges to create a new set of their feature types, e.g. "CDS,exon,5UTR", which allows for exhaustive and exclusive assignment of each basepair in a window to a particular "feature set". This isn't tested yet, hence experimental. - Commented out `IndexedDataContainer` methods that weren't used and were just getting in the way. - New `UniqueIdentifier` type which is useful for key to `usize` mapping. Then, implementations for `UniqueIdentifier` so it is an `IndexedDataContainer`. This allows for much more efficient storage of data in indexed data containers when one value is repeated a lot. Added `GRanges::new_vec_keyed()` and `GRanges::push_range_with_key()` to implement these methods (the former's name may change). - `RangeReduced` type, which stores "reduced" or flattened ranges. This is used in the `--reduce` option of `hist-features`. - `ConditionalMergingIterator` -- without the `Result`. - `reduce_ranges()` method. 
--- src/commands.rs | 235 ++++++++++++++++++++++++------- src/data/ndarray.rs | 32 ++--- src/data/vec.rs | 8 +- src/granges.rs | 53 ++++++- src/join.rs | 289 +++++++++++++++++++++++++++++++-------- src/lib.rs | 5 +- src/main/mod.rs | 24 ++-- src/merging_iterators.rs | 113 +++++++++++++++ src/traits.rs | 12 +- src/unique_id.rs | 210 ++++++++++++++++++++++++++++ 10 files changed, 835 insertions(+), 146 deletions(-) create mode 100644 src/unique_id.rs diff --git a/src/commands.rs b/src/commands.rs index 8b9c6d9..f34c634 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -23,6 +23,7 @@ use crate::{ ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty}, reporting::{CommandOutput, Report}, test_utilities::{random_granges, random_granges_mock_bed5}, + unique_id::UniqueIdentifier, Position, PositionOffset, }; @@ -723,7 +724,7 @@ fn transpose(v: Vec>) -> Vec> { /// This is useful for a quick exploratory look at feature density /// by window, e.g. with /// -/// $ hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \ +/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \ /// hg38_seqlens.tsv --headers | xsv table -d'\t' | less /// /// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing @@ -750,7 +751,20 @@ pub struct HistFeatures { #[arg(short, long)] chop: bool, + /// Assign each basepair exclusively to a single "feature set" + /// based on what features overlap it, and count the number of + /// these per window. E.g. a region that overlaps "CDS" and "exon" + /// will be called a "CDS,exon" feature. Columns are sorted + /// in descending order by column sums. Headers will always be + /// used in this case. + /// + /// ⚠️ Warning! This feature is *experimental* and currently does not + /// have unit or integration tests. + #[arg(short, long)] + exclusive: bool, + /// Whether to output the results as a TSV with headers. 
+ /// When --classify is set, headers will always be output. #[arg(short = 'l', long)] headers: bool, @@ -763,71 +777,194 @@ impl HistFeatures { pub fn run(&self) -> Result, GRangesError> { let bedfile = &self.bedfile; let genome = read_seqlens(&self.genome)?; - let bed4_iter = Bed4Iterator::new(bedfile)?; - // Split the elements in the iterator by feature into multiple GRanges objects. - let mut records_by_features: HashMap> = HashMap::new(); - for result in bed4_iter { - let range = result?; - let feature = &range.data.name; - let vec = records_by_features.entry(feature.to_string()).or_default(); - vec.push(range.into_empty()) // drop the feature, since we're hashing on it - } + if !self.exclusive { + // Split the elements in the iterator by feature into multiple GRanges objects. + let mut records_by_features: HashMap> = + HashMap::new(); + for result in bed4_iter { + let range = result?; + let feature = &range.data.name; + let vec = records_by_features.entry(feature.to_string()).or_default(); + vec.push(range.into_empty()) // drop the feature, since we're hashing on it + } - // Load all the *merged* split records into memory as interval trees. - let mut gr_by_features: HashMap> = HashMap::new(); - for (feature, ranges) in records_by_features.into_iter() { - // doing a streaming merge of each feature; bookend merge - let merging_iter = MergingEmptyIterator::new(ranges, 0); + // Load all the *merged* split records into memory as interval trees. 
+ let mut gr_by_features: HashMap> = HashMap::new(); + for (feature, ranges) in records_by_features.into_iter() { + // doing a streaming merge of each feature; bookend merge + let merging_iter = MergingEmptyIterator::new(ranges, 0); - // load into memory and convert to interval trees - let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + // load into memory and convert to interval trees + let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; - assert!(!gr_by_features.contains_key(&feature)); - gr_by_features.insert(feature, gr); - } + assert!(!gr_by_features.contains_key(&feature)); + gr_by_features.insert(feature, gr); + } - // Now, create windows. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - let features: Vec<_> = gr_by_features.keys().cloned().collect(); - let mut feature_matrix = Vec::new(); - for (_feature, gr) in gr_by_features.into_iter() { - // find the features that overlap each window - // TODO/OPTIMIZE: how costly is this clone? - let window_counts = windows.clone() - .left_overlaps(&gr)? - .map_joins(|joins| { - // these are merged, so this is the number of *unique* basepairs - let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); - total_overlaps - })? - .take_data()?; - feature_matrix.push(window_counts); - } + // Now, create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + let features: Vec<_> = gr_by_features.keys().cloned().collect(); + let mut feature_matrix = Vec::new(); + for (_feature, gr) in gr_by_features.into_iter() { + // find the features that overlap each window + // TODO/OPTIMIZE: how costly is this clone? + let window_counts = windows + .clone() + .left_overlaps(&gr)? + .map_joins(|joins| { + // these are merged, so this is the number of *unique* basepairs + let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); + total_overlaps + })? 
+ .take_data()?; + feature_matrix.push(window_counts); + } - let feature_matrix = transpose(feature_matrix); + let feature_matrix = transpose(feature_matrix); - // Unite the windows with the feature matrix. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - let windows = windows.into_granges_data(feature_matrix)?; + // Unite the windows with the feature matrix. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + let windows = windows.into_granges_data(feature_matrix)?; - // Write everything. - if !self.headers { - // we have to *manually* add headers (serde needs flat structs otherwise) - windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + // Write everything. + if !self.headers { + // we have to *manually* add headers (serde needs flat structs otherwise) + windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + } else { + let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; + headers.extend(features); + + let config = TsvConfig { + no_value_string: "NA".to_string(), + headers: Some(headers), + }; + windows.write_to_tsv(self.output.as_ref(), &config)?; + } } else { + // This is done a bit differently than the non-classify case + + // Create GRanges of the feature indices. + // We do this manually to avoid creating a needless data container. + let mut gr = GRanges::new_vec_keyed(&genome); + for result in bed4_iter { + let range = result?; + + // Note the trick here: we are *re-using* indices. + gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; + } + let gr = gr.into_coitrees()?; + + // clone the feature map + let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); + + // Create windows. 
+ let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + // "Reduce" ranges and + let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { + // "reduce" the ranges to a minimum spanning set that contains a vec of indices + let ranges = join.join.reduce_ranges(); + let mut feature_overlaps: HashMap, _> = HashMap::new(); + for range in ranges.iter() { + let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); + key.sort(); // sorted key is compound key + *feature_overlaps.entry(key).or_insert(0) += range.width(); + } + + feature_overlaps + })?; + + // Now, we go over the data and get the observed sets + let mut observed_sets = UniqueIdentifier::new(); + let data = windows_overlaps + .data() + .ok_or(GRangesError::NoDataContainer)?; + for feature_counts in data.iter() { + for feature_set in feature_counts.keys() { + observed_sets.get_or_insert(feature_set); + } + } + let nsets = observed_sets.len(); // total feature sets + + // Go over data again, making the sparse hashmap into a full + // matrix of overlappng basepairs. + let gr = windows_overlaps.map_data(|feature_counts| { + // fill out matrix + let mut counts = vec![0; nsets]; + for (i, feature_set) in observed_sets.keys().enumerate() { + counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); + } + counts + })?; + + // To make this prettier, let's sort by column totals (this + // is a little pricey; we can make option later if needed). + let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?; + let col_totals = column_totals(matrix); + let sorted_indices = sorted_indices_by_values(&col_totals); + + let gr = gr.map_data(|row| { + let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); + row + })?; + + // Create the labels. 
+ let feature_sets: Vec = observed_sets + .keys() + .map(|indices_key| { + let labels: Vec<_> = indices_key + .iter() + .map(|idx| feature_map.get_key(*idx).unwrap().clone()) + .collect(); + labels.join(",") + }) + .collect(); + + // Re-order the headers too by the sorted indices. + let feature_sets: Vec<_> = sorted_indices.iter() + .map(|i| feature_sets[*i].clone()).collect(); + let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; - headers.extend(features); + headers.extend(feature_sets); let config = TsvConfig { no_value_string: "NA".to_string(), headers: Some(headers), }; - windows.write_to_tsv(self.output.as_ref(), &config)?; + gr.write_to_tsv(self.output.as_ref(), &config)?; } - Ok(CommandOutput::new((), None)) } } + + +// get column totals +fn column_totals(matrix: &Vec>) -> Vec { + if matrix.is_empty() || matrix[0].is_empty() { + return Vec::new(); + } + + let num_columns = matrix[0].len(); + let mut totals = vec![0; num_columns]; + + for row in matrix { + for (i, &value) in row.iter().enumerate() { + if i < totals.len() { + totals[i] += value; + } + } + } + + totals +} + +// get descending indices order +fn sorted_indices_by_values(values: &[Position]) -> Vec { + let mut indices: Vec = (0..values.len()).collect(); + indices.sort_by_key(|&i| std::cmp::Reverse(values[i])); + indices +} + diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs index 1d52436..43050b6 100644 --- a/src/data/ndarray.rs +++ b/src/data/ndarray.rs @@ -146,7 +146,7 @@ where { type Item<'a> = U; type OwnedItem = U; - type Output = Array1; + // type Output = Array1; fn get_value(&self, index: usize) -> Self::Item<'_> { self[index] @@ -165,9 +165,9 @@ where index < self.shape()[0] } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) - } + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) 
+ // } } impl IndexedDataContainer for Array2 @@ -176,7 +176,7 @@ where { type Item<'a> = ArrayView1<'a, U>; type OwnedItem = Array1; - type Output = Array2; + // type Output = Array2; fn get_value(&self, index: usize) -> Self::Item<'_> { self.row(index) @@ -194,19 +194,19 @@ where index < self.shape()[0] } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - let cols = self.shape()[1]; + //fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // let cols = self.shape()[1]; - let rows_data: Vec = indices - .iter() - .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) - .collect(); + // let rows_data: Vec = indices + // .iter() + // .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) + // .collect(); - // create a new Array2 from the rows - // shape is (number of indices, number of columns) - Array2::from_shape_vec((indices.len(), cols), rows_data) - .expect("Shape and collected data size mismatch") - } + // // create a new Array2 from the rows + // // shape is (number of indices, number of columns) + // Array2::from_shape_vec((indices.len(), cols), rows_data) + // .expect("Shape and collected data size mismatch") + //} } #[cfg(test)] diff --git a/src/data/vec.rs b/src/data/vec.rs index 42ccd79..52256c5 100644 --- a/src/data/vec.rs +++ b/src/data/vec.rs @@ -11,7 +11,7 @@ where { type Item<'a> = &'a U where Self: 'a; type OwnedItem = U; - type Output = Vec; + // type Output = Vec; fn get_value(&self, index: usize) -> Self::Item<'_> { self.get(index).unwrap() @@ -29,7 +29,7 @@ where self.get(index).is_some() } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) - } + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) + //} } diff --git a/src/granges.rs b/src/granges.rs index cc1fbaf..62e2b8c 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ 
-37,7 +37,7 @@ //! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator //! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees -use std::{collections::HashSet, path::PathBuf}; +use std::{collections::HashSet, hash::Hash, path::PathBuf}; use genomap::GenomeMap; use indexmap::IndexMap; @@ -64,6 +64,7 @@ use crate::{ GenomicRangesTsvSerialize, IndexedDataContainer, IterableRangeContainer, LeftOverlaps, RangeContainer, }, + unique_id::UniqueIdentifier, Position, PositionOffset, }; @@ -305,6 +306,48 @@ impl GRanges, T> { } } +impl GRanges> { + /// Create a new [`GRanges`] object, with a [`UniqueIdentifier`] as a + /// data container. + pub fn new_vec_keyed(seqlens: &IndexMap) -> Self { + let mut ranges = GenomeMap::new(); + for (seqname, length) in seqlens.iter() { + // this should never happen because the error is only if + // insert encounters a seqname that's already been inserted -- that + // cannot happen here. + ranges + .insert(seqname, VecRanges::new(*length)) + .expect("Internal error: please report"); + } + Self { ranges, data: None } + } + pub fn push_range_with_key( + &mut self, + seqname: &str, + start: Position, + end: Position, + key: &K, + ) -> Result<(), GRangesError> { + let data_ref = match self.data { + Some(ref mut data) => data, + None => { + self.data = Some(UniqueIdentifier::new()); + self.data.as_mut().unwrap() // Safe unwrap because we just inserted a value + } + }; + + let index = data_ref.get_or_insert(key); + let range = RangeIndexed::new(start, end, index); + + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) + } +} + impl GRanges, T> { /// Adjust all the ranges in this [`GRanges`] object in place. 
pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { @@ -805,7 +848,7 @@ where if let Some(right_ranges) = right.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; @@ -846,7 +889,7 @@ where if let Some(right_ranges) = right.0.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; @@ -898,7 +941,7 @@ where if let Some(right_ranges) = right.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start(), left_range.end(), join_data)?; @@ -949,7 +992,7 @@ where if let Some(right_ranges) = right.0.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start(), left_range.end(), join_data)?; diff --git a/src/join.rs b/src/join.rs index 44f08fe..a40f3d2 100644 --- a/src/join.rs +++ b/src/join.rs @@ -2,80 +2,174 @@ //! 
#![allow(clippy::all)] +use std::collections::HashSet; + use crate::{ traits::{GenericRange, IndexedDataContainer, JoinDataOperations}, Position, }; -/// [`LeftGroupedJoin`] contains information about the right ranges -/// and their degree of overlap with a focal left range. This information -/// is designed to facilitate downstream statistical sumamries of the -/// corresponding data in overlapping ranges. +/// This is a generic range used just in join logic, to avoid +/// having to handle bringing range types into [`LeftGroupedJoin`], +/// which clog up the API a bit. +/// These can represent indexed and empty ranges. #[derive(Clone, Debug, PartialEq)] -pub struct LeftGroupedJoin { - /// The data index for the left range. - left: Option, +pub struct RangeTuple((Position, Position, Option)); - /// A `Vec` of the indices for the overlapping right ranges. - /// This is `None` if the right ranges do not have a data container. - rights: Option>, +impl GenericRange for RangeTuple { + fn start(&self) -> Position { + self.0 .0 + } + fn end(&self) -> Position { + self.0 .1 + } + fn index(&self) -> Option { + self.0 .2 + } +} - /// The length of the left range. - left_length: Position, +/// This is a special "reduced" range that stores indices +/// to multiple data elements. +#[derive(Clone, Debug, PartialEq)] +pub struct RangeReduced((Position, Position, HashSet>)); - /// The lengths of the right ranges. - right_lengths: Vec, +impl RangeReduced { + pub fn indices(&self) -> &HashSet> { + &self.0 .2 + } +} - /// The lengths of the overlaps between the left and right ranges. - overlaps: Vec, - // TODO: we may want some simple summary of whether an overlapping range is - // up or downstream. I think the cleanest summary is a signed integer - // representing the side and degree of non-overlap. E.g. a range - // that overlaps another but overhangs the 3' side of the focal left - // range by 10bp is +10; if it were 5', it would be -10. 
+/// Create a vector of "reduced" or flattened ranges, that stack the indices of each range. +/// +/// +/// This first finds every range position, sorts, and dedups these. Then, +/// it iterates through each range of adjacent positions. Each of these +/// new ranges is guaranteed to be covered by >= 1 range. +pub fn reduce_ranges(ranges: &Vec) -> Vec { + let mut range_ends: Vec = ranges + .iter() + .flat_map(|range| vec![range.start(), range.end()].into_iter()) + .collect(); + range_ends.sort_unstable(); + range_ends.dedup(); + + let mut ranges_reduced = Vec::new(); + for range in range_ends.windows(2) { + let mut indices = HashSet::new(); + if let [start, end] = range { + for range in ranges { + if range.start() < *end && range.end() > *start { + indices.insert(range.index()); + } + } + if !indices.is_empty() { + ranges_reduced.push(RangeReduced((*start, *end, indices))); + } + } + } + ranges_reduced +} - // left_data: Option<&'a DL>, - // right_data: Option<&'a DR>, +impl GenericRange for RangeReduced { + fn start(&self) -> Position { + self.0 .0 + } + fn end(&self) -> Position { + self.0 .1 + } + // Note: [`RangeReduced`] do not have valid indices, + // so this returns [`None`]. (They do have a [`Vec>`] + // of indices, but this has to be accessed through the type.) + fn index(&self) -> Option { + None + } +} + +/// [`LeftGroupedJoin`] contains information about the right ranges +/// and their degree of overlap with a focal left range. This information +/// is designed to facilitate downstream statistical sumamries of the +/// corresponding data in overlapping ranges. +#[derive(Clone, Debug, PartialEq)] +pub struct LeftGroupedJoin { + /// The left range. + pub left: RangeTuple, + + /// A `Vec` of the right overlapping ranges (unsorted). + // NOTE: previously lengths, overlap width, and their data + // indices were stored. For just one extra u32 we can store + // all the data in the original structure. 
+ pub rights: Vec, } impl LeftGroupedJoin { /// Create a new [`LeftGroupedJoin`]. pub fn new(left_range: &R) -> Self { Self { - left: left_range.index(), - rights: None, - left_length: left_range.width(), - right_lengths: Vec::new(), - overlaps: Vec::new(), - // left_data, - // right_data, + left: RangeTuple(left_range.as_tuple()), + rights: Vec::new(), } } /// Add a right (overlapping) range to this [`LeftGroupedJoin`]. /// // Note: in principle, this can be called on *non-overlapping* right ranges too, // for a full-outer join. - pub fn add_right(&mut self, left: &R, right: &Q) { - if let Some(right_index) = right.index() { - // the right range has data -- add to vec, initializing if not there - self.rights.get_or_insert_with(Vec::new).push(right_index) - } - self.right_lengths.push(right.width()); - self.overlaps.push(left.overlap_width(right)); + pub fn add_right(&mut self, right: &R) { + self.rights.push(RangeTuple(right.as_tuple())) + } + + /// Sort the right ranges, for faster downstream processing. + pub fn sort_ranges(&mut self) { + self.rights.sort_by(|a, b| { + a.start() + .cmp(&b.start()) + .then_with(|| a.end().cmp(&b.end())) + .then_with(|| a.index().cmp(&b.index())) + }); + } + + /// "Reduce" the ranges into a minimum set, with all + /// their indices gathered in a [`Vec>`]. + /// This returns a [`Vec`]. + pub fn reduce_ranges(&mut self) -> Vec { + // we need to trim these by the left range to get the + // proper overlaps within this left range + let rights: Vec<_> = self + .rights + .iter() + .map(|range| { + let (start, end) = range.overlap_range(&self.left).unwrap(); + RangeTuple((start, end, range.index())) + }) + .collect(); + reduce_ranges(&rights) } + /// Return whether this left range has any [`LeftGroupedJoin`]. pub fn has_overlaps(&self) -> bool { - !self.overlaps.is_empty() + !self.overlaps().is_empty() } /// Retrieve the number of right overlaps. 
pub fn num_overlaps(&self) -> usize { - self.overlaps.len() + self.overlaps().len() } /// Retrieve the right overlaps. - pub fn overlaps(&self) -> &Vec { - &self.overlaps + pub fn overlaps(&self) -> Vec { + self.rights + .iter() + .map(|r| r.overlap_width(&self.left)) + .collect() + } + + /// Get the left index. + pub fn left_index(&self) -> Option { + self.left.index() + } + + /// Get the right indices. + pub fn right_indices(&self) -> Vec> { + self.rights.iter().map(|r| r.index()).collect() } } @@ -164,13 +258,11 @@ where self.joins .into_iter() .map(|join| { - let left_data = self.left_data.get_owned(join.left.unwrap()); + let left_data = self.left_data.get_owned(join.left_index().unwrap()); let right_data = join - .rights - .as_ref() - .unwrap() + .right_indices() .iter() - .map(|idx| self.right_data.get_owned(*idx)) + .map(|idx| self.right_data.get_owned(idx.unwrap())) .collect(); func(CombinedJoinData { @@ -257,13 +349,11 @@ where self.joins .into_iter() .map(|join| { - let right_indices = join.rights.as_ref(); - let right_data = right_indices.map_or(Vec::new(), |indices| { - indices - .iter() - .map(|idx| self.right_data.get_owned(*idx)) - .collect() - }); + let right_indices = join.right_indices(); + let right_data = right_indices + .iter() + .map(|idx| self.right_data.get_owned(idx.unwrap())) + .collect(); func(CombinedJoinDataLeftEmpty { join, right_data }) }) @@ -343,7 +433,7 @@ where self.joins .into_iter() .map(|join| { - let left_data = self.left_data.get_owned(join.left.unwrap()); + let left_data = self.left_data.get_owned(join.left_index().unwrap()); func(CombinedJoinDataRightEmpty { join, left_data }) }) @@ -443,10 +533,9 @@ impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> { #[cfg(test)] mod tests { + use super::*; use crate::ranges::RangeIndexed; - use super::{JoinData, LeftGroupedJoin}; - #[test] fn test_join_data_new() { let left_data = vec![1, 2]; @@ -457,8 +546,94 @@ mod tests { let left = RangeIndexed::new(0, 10, 1); let mut join = 
LeftGroupedJoin::new(&left); let right = RangeIndexed::new(8, 10, 1); - join.add_right(&left, &right); + join.add_right(&right); jd.push(join); assert_eq!(jd.len(), 1); } + + #[test] + fn test_single_range_indexed() { + let ranges = vec![RangeIndexed { + start: 1, + end: 5, + index: 0, + }]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 1); + assert_eq!(reduced[0].0 .0, 1); + assert_eq!(reduced[0].0 .1, 5); + assert!(reduced[0].0 .2.contains(&Some(0))); + } + + #[test] + fn test_overlapping_ranges_indexed() { + let ranges = vec![ + RangeIndexed { + start: 1, + end: 4, + index: 0, + }, + RangeIndexed { + start: 3, + end: 6, + index: 1, + }, + ]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 3); + + let mut iter = reduced.iter(); + let first_range = iter.next().unwrap(); + assert_eq!(first_range.start(), 1); + assert_eq!(first_range.end(), 3); + assert_eq!(first_range.indices().len(), 1); + assert!(first_range.indices().contains(&Some(0))); + + let second_range = iter.next().unwrap(); + assert_eq!(second_range.start(), 3); + assert_eq!(second_range.end(), 4); + assert_eq!(second_range.indices().len(), 2); + assert!(second_range.indices().contains(&Some(0))); + assert!(second_range.indices().contains(&Some(1))); + + let third_range = iter.next().unwrap(); + assert_eq!(third_range.start(), 4); + assert_eq!(third_range.end(), 6); + assert_eq!(third_range.indices().len(), 1); + assert!(third_range.indices().contains(&Some(1))); + } + + #[test] + fn test_non_overlapping_ranges_indexed() { + let ranges = vec![ + RangeIndexed { + start: 1, + end: 3, + index: 0, + }, + RangeIndexed { + start: 4, + end: 6, + index: 1, + }, + ]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 2); + + let mut iter = reduced.iter(); + let first_range = iter.next().unwrap(); + assert_eq!(first_range.start(), 1); + assert_eq!(first_range.end(), 3); + assert_eq!(first_range.indices().len(), 1); + 
assert!(first_range.indices().contains(&Some(0))); + + let second_range = iter.next().unwrap(); + assert_eq!(second_range.start(), 4); + assert_eq!(second_range.end(), 6); + assert_eq!(second_range.indices().len(), 1); + assert!(second_range.indices().contains(&Some(1))); + } } diff --git a/src/lib.rs b/src/lib.rs index 28d4f77..b09d163 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -364,6 +364,7 @@ pub mod ranges; pub mod sequences; pub mod test_utilities; pub mod traits; +pub mod unique_id; // bringing these CLI modules into lib.rs rather than main/ allows for // use in integration tests and other Rust-side command line work @@ -425,8 +426,8 @@ pub mod prelude { }; pub use crate::merging_iterators::{ - ConditionalMergingResultIterator, MergingEmptyIterator, MergingEmptyResultIterator, - MergingResultIterator, + ConditionalMergingIterator, ConditionalMergingResultIterator, MergingEmptyIterator, + MergingEmptyResultIterator, MergingResultIterator, }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, diff --git a/src/main/mod.rs b/src/main/mod.rs index 243a3e7..70c6437 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -20,20 +20,24 @@ usage: granges [--help] Subcommands: - adjust: Adjust each genomic range, e.g. to add a kilobase to each end. + adjust: Adjust each genomic range, e.g. to add a kilobase to each end. - filter: Filter the left ranges based on whether they have at least one - overlap with a right range. This is equivalent to a filtering - "semi-join" in SQL terminology. + filter: Filter the left ranges based on whether they have at least one + overlap with a right range. This is equivalent to a filtering + "semi-join" in SQL terminology. - map: Compute the left grouped overlaps between the left genomic ranges - and right genomic ranges, and apply one or more operations to the - score column of the right BED5 file. + hist-features: Histogram feature coverage per windows. 
This merges overlapping + ranges first. - merge: Merge ranges that are within a minimum distance of each other. + map: Compute the left grouped overlaps between the left genomic ranges + and right genomic ranges, and apply one or more operations to the + score column of the right BED5 file. + + merge: Merge ranges that are within a minimum distance of each other. - windows: Create a set of genomic windows of the specified width (in basepairs), - stepping the specified step size (the width, by default). + windows: Create a set of genomic windows of the specified width (in + basepairs), stepping the specified step size (the width, by + default). NOTE: granges is under active development. It is not currently meant to be diff --git a/src/merging_iterators.rs b/src/merging_iterators.rs index 7b2d939..308a566 100644 --- a/src/merging_iterators.rs +++ b/src/merging_iterators.rs @@ -240,6 +240,119 @@ where } } +/// A merging iterator (over [`GenomicRangeRecord] items, +/// i.e. from a parsing iterator) that has an additional merge condition tested +/// by function `G` based on the items. 
+pub struct ConditionalMergingIterator +where + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + group: G, + accumulated_data: Vec, +} + +impl ConditionalMergingIterator +where + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F, group: G) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + group, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for ConditionalMergingIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + type Item = GenomicRangeRecord; + + fn next(&mut self) -> Option { + for next_range in self.inner.by_ref() { + if let Some(ref mut last_range) = self.last_range { + // If we have an additional group-by merging condition, + // check that. If not set this is always true. + let satifies_groupby = (self.group)(last_range, &next_range); + + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && satifies_groupby + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. 
+ let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extended range with accumulated data. + return Some(return_range); + } + } else { + // There is no last range -- this must be the first. + self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + }); + } + } + + None // Return None when all items have been processed + } +} + /// A merging iterator (over [`Result, GRangesError`] items, /// i.e. from a parsing iterator) that has an additional merge condition tested /// by function `G` based on the items. diff --git a/src/traits.rs b/src/traits.rs index a2d062b..43379de 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -79,6 +79,12 @@ pub trait GenericRange: Clone { overlap_end.saturating_sub(overlap_start) } + /// Return whether this range overlaps another range. 
+ fn has_overlap_with(&self, other: &R) -> bool { + // TODO/OPTIMIZE inefficient FIXME + self.overlap_width(other) > 0 + } + fn distance_or_overlap(&self, other: &R) -> PositionOffset { if self.end() > other.start() && self.start() < other.end() { // The ranges overlap -- return how much as a negative number @@ -308,7 +314,7 @@ pub trait IndexedDataContainer: DataContainer { where Self: 'a; type OwnedItem; - type Output; + // type Output; // this type is needed to reference the core underlying type, // eg to handle reference types fn is_valid_index(&self, index: usize) -> bool; @@ -328,8 +334,8 @@ pub trait IndexedDataContainer: DataContainer { invalid } - /// Create a new data container using a set of range indices. - fn new_from_indices(&self, indices: &[usize]) -> Self::Output; + ///// Create a new data container using a set of range indices. + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output; } /// The Sequences trait defines generic functionality for diff --git a/src/unique_id.rs b/src/unique_id.rs new file mode 100644 index 0000000..cbb7cea --- /dev/null +++ b/src/unique_id.rs @@ -0,0 +1,210 @@ +//! The [`UniqueIdentifier`] type, which stores a mapping between +//! some key type and a `usize` index. +//! +//! Note that these are not thread-safe currently but +//! it is relatively easy to do -- submit a GitHub +//! issue if you need this feature. 
+ +use std::{ + cell::RefCell, + collections::{hash_map::Keys, HashMap}, + hash::Hash, +}; + +use crate::traits::{DataContainer, IndexedDataContainer}; + +impl DataContainer for UniqueIdentifier {} +impl IndexedDataContainer for UniqueIdentifier { + type Item<'a> = K where K: 'a; + type OwnedItem = K; + + fn get_value(&self, index: usize) -> ::Item<'_> { + self.get_key(index).unwrap().clone() + } + + fn len(&self) -> usize { + self.len() + } + + fn get_owned(&self, index: usize) -> ::OwnedItem { + self.get_value(index).clone() + } + + fn is_valid_index(&self, index: usize) -> bool { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .contains_key(&index) + } +} + +#[derive(Clone, Debug)] +pub struct UniqueIdentifier { + map: HashMap, + next_key: usize, + reverse_map: RefCell>>, +} + +impl Default for UniqueIdentifier +where + K: std::cmp::Eq + Hash + Clone, +{ + fn default() -> Self { + Self::new() + } +} + +impl UniqueIdentifier +where + K: std::cmp::Eq + Hash + Clone, +{ + pub fn new() -> Self { + Self { + map: HashMap::new(), + next_key: 0, + reverse_map: RefCell::new(None), + } + } + + pub fn get_or_insert(&mut self, s: &K) -> usize { + match self.map.get(s) { + Some(&key) => key, + None => { + let key = self.next_key; + self.map.insert(s.clone(), key); + self.next_key += 1; + // we need to invalidate the cached reverse map + let mut reverse_map_borrow = self.reverse_map.borrow_mut(); + *reverse_map_borrow = None; + key + } + } + } + + fn ensure_reverse_map_created(&self) { + let mut reverse_map_borrow = self.reverse_map.borrow_mut(); + if reverse_map_borrow.is_none() { + let reverse_map = self.map.iter().map(|(k, v)| (*v, k.clone())).collect(); + *reverse_map_borrow = Some(reverse_map); + } + } + + pub fn get_key(&self, index: usize) -> Option { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .get(&index) + .cloned() + } + + pub fn is_valid_index(&self, index: usize) -> 
bool { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .contains_key(&index) + } + + pub fn keys(&self) -> Keys<'_, K, usize> { + self.map.keys() + } + + pub fn indices(&self) -> Vec { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .keys() + .cloned() + .collect() + } + + pub fn get_index(&self, s: &K) -> Option { + self.map.get(s).cloned() + } + + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insert_and_get_key() { + let mut ui = UniqueIdentifier::new(); + let key = "test_key"; + let index = ui.get_or_insert(&key); + + assert_eq!(index, 0); + assert_eq!(ui.get_key(index), Some(key)); + } + + #[test] + fn test_is_valid_index() { + let mut ui = UniqueIdentifier::new(); + let key = "another_key"; + ui.get_or_insert(&key); + + assert!(ui.is_valid_index(0)); + assert!(!ui.is_valid_index(1)); + + let key = "another!"; + let index = ui.get_or_insert(&key); + assert_eq!(index, 1); + assert!(ui.is_valid_index(index)); + } + + #[test] + fn test_len() { + let mut ui = UniqueIdentifier::new(); + assert_eq!(ui.len(), 0); + + ui.get_or_insert(&"key1"); + assert_eq!(ui.len(), 1); + + ui.get_or_insert(&"key2"); + assert_eq!(ui.len(), 2); + + // Inserting an existing key should not increase the length + ui.get_or_insert(&"key1"); + assert_eq!(ui.len(), 2); + } + + #[test] + fn test_indices() { + let mut ui = UniqueIdentifier::new(); + ui.get_or_insert(&"key1"); + ui.get_or_insert(&"key2"); + + let indices = ui.indices(); + assert_eq!(indices.len(), 2); + assert!(indices.contains(&0)); + assert!(indices.contains(&1)); + } + + #[test] + fn test_get_index() { + let mut ui = UniqueIdentifier::new(); + let key = "key_to_find"; + ui.get_or_insert(&key); + + let index = ui.get_index(&key); + assert_eq!(index, Some(0)); + + let missing_key = "missing_key"; + 
assert_eq!(ui.get_index(&missing_key), None); + } +}