Redesign of how joins are stored in LeftGroupedJoin.

- Previously only overlap and data indices were stored in `LeftGroupedJoin`. However, this makes downstream operations that need to compare, merge/flatten, etc the right ranges harder. So, now a `Vec<RangeTuples>` are stored. `RangeTuple` is a new minimal range type with `Option<usize>` index (they fit well in `LeftGroupedJoin`, since this option avoids having to carry indexed/empty types to this level, which makes prevents API clog. - `RangeTuple` and functionality added. - New experimental method `hist-feature` `--reduce` option which forces overlapping right ranges to create a new set of their feature types, e.g. "CDS,exon,5UTR", which allows for exhaustive and exclusive assignment of each basepair in a window to a particular "feature set". This isn't tested yet, hence experimental. - Commented out `IndexedDataContainer` methods that weren't used and just getting in the way. - New `UniqueIdentifier` type which is useful for key to `usize` mapping. Then, implemntations for `UniqueIdentifier` so it is a `IndexedDataContainer`. This allows for much more efficient storage of data in indexed data containers when one value is repeated a lot. Added `GRanges::new_vec_keyed()` and `GRanges::push_range_with_key()` to implement these methods (the former's name may change). - `RangeReduced` type, which stores "reduced" or flattend ranges. This is used in the `--reduce` option of `hist-features`. - `ConditionalMergingIterator` -- without the `Result`. - `reduce_ranges()` method. - Tests are now seeded. There is an incredibly rare edgecase difference with some stochastically-seeded random genomic ranges with `test_against_bedtools_map_multiple`. The results look like: bedtools: `Some(0.9625260632890147)`, granges: `None`. The cause of this bug, and whether it is a granges or bedtools issue is unknown. But it is rare.
vsbuffalo · Mar 5, 2024 · 3b6cd34 · 3b6cd34
1 parent 2e98a14
commit 3b6cd34
Show file tree

Hide file tree

Showing 12 changed files with 848 additions and 155 deletions.
diff --git a/src/commands.rs b/src/commands.rs
@@ -23,6 +23,7 @@ use crate::{
  ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty},
  reporting::{CommandOutput, Report},
  test_utilities::{random_granges, random_granges_mock_bed5},
+ unique_id::UniqueIdentifier,
  Position, PositionOffset,
 };
 
@@ -723,7 +724,7 @@ fn transpose<T>(v: Vec<Vec<T>>) -> Vec<Vec<T>> {
 /// This is useful for a quick exploratory look at feature density
 /// by window, e.g. with
 ///
-/// $ hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
+/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
 /// hg38_seqlens.tsv --headers | xsv table -d'\t' | less
 ///
 /// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing
@@ -750,7 +751,20 @@ pub struct HistFeatures {
  #[arg(short, long)]
  chop: bool,
 
+ /// Assign each basepair exclusively to a single "feature set" 
+ /// based on what features overlap it, and count the number of 
+ /// these per window. E.g. a region that overlaps "CDS" and "exon"
+ /// will be called a "CDS,exon" feature. Columns are sorted
+ /// in descending order by column sums. Headers will always be
+ /// used in this case.
+ ///
+ /// ⚠️ Warning! This feature is *experimental* and currently does not
+ /// have unit or integration tests.
+ #[arg(short, long)]
+ exclusive: bool,
+
  /// Whether to output the results as a TSV with headers.
+ /// When --classify is set, headers will always be output.
  #[arg(short = 'l', long)]
  headers: bool,
 
@@ -763,71 +777,194 @@ impl HistFeatures {
  pub fn run(&self) -> Result<CommandOutput<()>, GRangesError> {
  let bedfile = &self.bedfile;
  let genome = read_seqlens(&self.genome)?;
-
  let bed4_iter = Bed4Iterator::new(bedfile)?;
 
- // Split the elements in the iterator by feature into multiple GRanges objects.
- let mut records_by_features: HashMap<String, Vec<GenomicRangeRecordEmpty>> = HashMap::new();
- for result in bed4_iter {
- let range = result?;
- let feature = &range.data.name;
- let vec = records_by_features.entry(feature.to_string()).or_default();
- vec.push(range.into_empty()) // drop the feature, since we're hashing on it
- }
+ if !self.exclusive {
+ // Split the elements in the iterator by feature into multiple GRanges objects.
+ let mut records_by_features: HashMap<String, Vec<GenomicRangeRecordEmpty>> =
+ HashMap::new();
+ for result in bed4_iter {
+ let range = result?;
+ let feature = &range.data.name;
+ let vec = records_by_features.entry(feature.to_string()).or_default();
+ vec.push(range.into_empty()) // drop the feature, since we're hashing on it
+ }
 
- // Load all the *merged* split records into memory as interval trees.
- let mut gr_by_features: HashMap<String, GRangesEmpty<COITreesEmpty>> = HashMap::new();
- for (feature, ranges) in records_by_features.into_iter() {
- // doing a streaming merge of each feature; bookend merge
- let merging_iter = MergingEmptyIterator::new(ranges, 0);
+  // Load all the *merged* split records into memory as interval trees.
+  let mut gr_by_features: HashMap<String, GRangesEmpty<COITreesEmpty>> = HashMap::new();
+  for (feature, ranges) in records_by_features.into_iter() {
+  // doing a streaming merge of each feature; bookend merge
+  let merging_iter = MergingEmptyIterator::new(ranges, 0);
 
- // load into memory and convert to interval trees
- let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?;
+  // load into memory and convert to interval trees
+  let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?;
 
- assert!(!gr_by_features.contains_key(&feature));
- gr_by_features.insert(feature, gr);
- }
+  assert!(!gr_by_features.contains_key(&feature));
+  gr_by_features.insert(feature, gr);
+  }
 
- // Now, create windows.
- let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
-
- let features: Vec<_> = gr_by_features.keys().cloned().collect();
- let mut feature_matrix = Vec::new();
- for (_feature, gr) in gr_by_features.into_iter() {
- // find the features that overlap each window
- // TODO/OPTIMIZE: how costly is this clone?
- let window_counts = windows.clone()
- .left_overlaps(&gr)?
- .map_joins(|joins| {
- // these are merged, so this is the number of *unique* basepairs
- let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum();
- total_overlaps
- })?
- .take_data()?;
- feature_matrix.push(window_counts);
- }
+ // Now, create windows.
+ let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
+
+ let features: Vec<_> = gr_by_features.keys().cloned().collect();
+ let mut feature_matrix = Vec::new();
+ for (_feature, gr) in gr_by_features.into_iter() {
+ // find the features that overlap each window
+ // TODO/OPTIMIZE: how costly is this clone?
+ let window_counts = windows
+ .clone()
+ .left_overlaps(&gr)?
+ .map_joins(|joins| {
+ // these are merged, so this is the number of *unique* basepairs
+ let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum();
+ total_overlaps
+ })?
+ .take_data()?;
+ feature_matrix.push(window_counts);
+ }
 
- let feature_matrix = transpose(feature_matrix);
+  let feature_matrix = transpose(feature_matrix);
 
- // Unite the windows with the feature matrix.
- let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
- let windows = windows.into_granges_data(feature_matrix)?;
+  // Unite the windows with the feature matrix.
+  let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
+  let windows = windows.into_granges_data(feature_matrix)?;
 
- // Write everything.
- if !self.headers {
- // we have to *manually* add headers (serde needs flat structs otherwise)
- windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?;
+ // Write everything.
+ if !self.headers {
+ // we have to *manually* add headers (serde needs flat structs otherwise)
+ windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?;
+ } else {
+ let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()];
+ headers.extend(features);
+
+ let config = TsvConfig {
+ no_value_string: "NA".to_string(),
+ headers: Some(headers),
+ };
+ windows.write_to_tsv(self.output.as_ref(), &config)?;
+ }
  } else {
+ // This is done a bit differently than the non-classify case
+
+ // Create GRanges of the feature indices.
+ // We do this manually to avoid creating a needless data container.
+ let mut gr = GRanges::new_vec_keyed(&genome);
+ for result in bed4_iter {
+ let range = result?;
+
+ // Note the trick here: we are *re-using* indices.
+ gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?;
+ }
+ let gr = gr.into_coitrees()?;
+
+ // clone the feature map
+ let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone();
+
+ // Create windows.
+ let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
+
+ // "Reduce" ranges and
+ let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| {
+ // "reduce" the ranges to a minimum spanning set that contains a vec of indices
+ let ranges = join.join.reduce_ranges();
+ let mut feature_overlaps: HashMap<Vec<usize>, _> = HashMap::new();
+ for range in ranges.iter() {
+ let mut key: Vec<usize> = range.indices().iter().map(|x| x.unwrap()).collect();
+ key.sort(); // sorted key is compound key
+ *feature_overlaps.entry(key).or_insert(0) += range.width();
+ }
+
+ feature_overlaps
+ })?;
+
+ // Now, we go over the data and get the observed sets
+ let mut observed_sets = UniqueIdentifier::new();
+ let data = windows_overlaps
+ .data()
+ .ok_or(GRangesError::NoDataContainer)?;
+ for feature_counts in data.iter() {
+ for feature_set in feature_counts.keys() {
+ observed_sets.get_or_insert(feature_set);
+ }
+ }
+ let nsets = observed_sets.len(); // total feature sets
+
+ // Go over data again, making the sparse hashmap into a full
+ // matrix of overlappng basepairs.
+ let gr = windows_overlaps.map_data(|feature_counts| {
+ // fill out matrix
+ let mut counts = vec![0; nsets];
+ for (i, feature_set) in observed_sets.keys().enumerate() {
+ counts[i] = *feature_counts.get(feature_set).unwrap_or(&0);
+ }
+ counts
+ })?;
+
+ // To make this prettier, let's sort by column totals (this
+ // is a little pricey; we can make option later if needed).
+ let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?;
+ let col_totals = column_totals(matrix);
+ let sorted_indices = sorted_indices_by_values(&col_totals);
+
+ let gr = gr.map_data(|row| {
+ let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect();
+ row
+ })?;
+
+ // Create the labels.
+ let feature_sets: Vec<String> = observed_sets
+ .keys()
+ .map(|indices_key| {
+ let labels: Vec<_> = indices_key
+ .iter()
+ .map(|idx| feature_map.get_key(*idx).unwrap().clone())
+ .collect();
+ labels.join(",")
+ })
+ .collect();
+
+ // Re-order the headers too by the sorted indices.
+ let feature_sets: Vec<_> = sorted_indices.iter()
+ .map(|i| feature_sets[*i].clone()).collect();
+
  let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()];
- headers.extend(features);
+ headers.extend(feature_sets);
 
  let config = TsvConfig {
  no_value_string: "NA".to_string(),
  headers: Some(headers),
  };
- windows.write_to_tsv(self.output.as_ref(), &config)?;
+ gr.write_to_tsv(self.output.as_ref(), &config)?;
  }
-
  Ok(CommandOutput::new((), None))
  }
 }
+
+
+// get column totals 
+fn column_totals(matrix: &Vec<Vec<Position>>) -> Vec<Position> {
+ if matrix.is_empty() || matrix[0].is_empty() {
+ return Vec::new();
+ }
+
+ let num_columns = matrix[0].len();
+ let mut totals = vec![0; num_columns];
+
+ for row in matrix {
+ for (i, &value) in row.iter().enumerate() {
+ if i < totals.len() {
+ totals[i] += value;
+ }
+ }
+ }
+
+ totals
+} 
+
+// get descending indices order
+fn sorted_indices_by_values(values: &[Position]) -> Vec<usize> {
+ let mut indices: Vec<usize> = (0..values.len()).collect();
+ indices.sort_by_key(|&i| std::cmp::Reverse(values[i]));
+ indices
+}
+
diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs
@@ -146,7 +146,7 @@ where
 {
  type Item<'a> = U;
  type OwnedItem = U;
- type Output = Array1<U>;
+ // type Output = Array1<U>;
 
  fn get_value(&self, index: usize) -> Self::Item<'_> {
  self[index]
@@ -165,9 +165,9 @@ where
  index < self.shape()[0]
  }
 
- fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
-   Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx)))
- }
+ // fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+ // Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx)))
+ // }
 }
 
 impl<U> IndexedDataContainer for Array2<U>
@@ -176,7 +176,7 @@ where
 {
  type Item<'a> = ArrayView1<'a, U>;
  type OwnedItem = Array1<U>;
- type Output = Array2<U>;
+ // type Output = Array2<U>;
 
  fn get_value(&self, index: usize) -> Self::Item<'_> {
  self.row(index)
@@ -194,19 +194,19 @@ where
  index < self.shape()[0]
  }
 
- fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
- let cols = self.shape()[1];
+ //fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+ // let cols = self.shape()[1];
 
- let rows_data: Vec<U> = indices
- .iter()
- .flat_map(|&idx| self.row(idx).iter().cloned().collect::<Vec<_>>())
- .collect();
+ // let rows_data: Vec<U> = indices
+ // .iter()
+ // .flat_map(|&idx| self.row(idx).iter().cloned().collect::<Vec<_>>())
+ // .collect();
 
- // create a new Array2<U> from the rows
- // shape is (number of indices, number of columns)
- Array2::from_shape_vec((indices.len(), cols), rows_data)
- .expect("Shape and collected data size mismatch")
- }
+ // // create a new Array2<U> from the rows
+ // // shape is (number of indices, number of columns)
+ // Array2::from_shape_vec((indices.len(), cols), rows_data)
+ // .expect("Shape and collected data size mismatch")
+ //}
 }
 
 #[cfg(test)]

diff --git a/src/data/vec.rs b/src/data/vec.rs
@@ -11,7 +11,7 @@ where
 {
  type Item<'a> = &'a U where Self: 'a;
  type OwnedItem = U;
- type Output = Vec<U>;
+ // type Output = Vec<U>;
 
  fn get_value(&self, index: usize) -> Self::Item<'_> {
  self.get(index).unwrap()
@@ -29,7 +29,7 @@ where
  self.get(index).is_some()
  }
 
- fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
- Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
- }
+ // fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+ // Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
+ //}
 }