From e0b9c3b57cc04847358161beb93316dfc374b403 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Mon, 4 Mar 2024 23:46:23 -0800 Subject: [PATCH] Redesign of how joins are stored in `LeftGroupedJoin`. - Previously only overlap and data indices were stored in `LeftGroupedJoin`. However, this makes downstream operations that need to compare, merge/flatten, etc. the right ranges harder. So, now a `Vec` is stored. `RangeTuple` is a new minimal range type with `Option` index (they fit well in `LeftGroupedJoin`, since this option avoids having to carry indexed/empty types to this level, which prevents API clog). - `RangeTuple` and functionality added. - New experimental method `hist-feature` `--reduce` option which forces overlapping right ranges to create a new set of their feature types, e.g. "CDS,exon,5UTR", which allows for exhaustive and exclusive assignment of each basepair in a window to a particular "feature set". This isn't tested yet, hence experimental. - Commented out `IndexedDataContainer` methods that weren't used and were just getting in the way. - New `UniqueIdentifier` type which is useful for key to `usize` mapping. Then, implementations for `UniqueIdentifier` so it is an `IndexedDataContainer`. This allows for much more efficient storage of data in indexed data containers when one value is repeated a lot. Added `GRanges::new_vec_keyed()` and `GRanges::push_range_with_key()` to implement these methods (the former's name may change). - `RangeReduced` type, which stores "reduced" or flattened ranges. This is used in the `--reduce` option of `hist-features`. - `ConditionalMergingIterator` -- without the `Result`. - `reduce_ranges()` method. 
--- src/commands.rs | 235 ++++++++++++++++++++++++------- src/data/ndarray.rs | 32 ++--- src/data/vec.rs | 8 +- src/granges.rs | 53 ++++++- src/join.rs | 289 +++++++++++++++++++++++++++++++-------- src/lib.rs | 5 +- src/main/mod.rs | 24 ++-- src/merging_iterators.rs | 113 +++++++++++++++ src/traits.rs | 12 +- src/unique_id.rs | 210 ++++++++++++++++++++++++++++ 10 files changed, 835 insertions(+), 146 deletions(-) create mode 100644 src/unique_id.rs diff --git a/src/commands.rs b/src/commands.rs index 8b9c6d9..f34c634 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -23,6 +23,7 @@ use crate::{ ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty}, reporting::{CommandOutput, Report}, test_utilities::{random_granges, random_granges_mock_bed5}, + unique_id::UniqueIdentifier, Position, PositionOffset, }; @@ -723,7 +724,7 @@ fn transpose(v: Vec>) -> Vec> { /// This is useful for a quick exploratory look at feature density /// by window, e.g. with /// -/// $ hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \ +/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \ /// hg38_seqlens.tsv --headers | xsv table -d'\t' | less /// /// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing @@ -750,7 +751,20 @@ pub struct HistFeatures { #[arg(short, long)] chop: bool, + /// Assign each basepair exclusively to a single "feature set" + /// based on what features overlap it, and count the number of + /// these per window. E.g. a region that overlaps "CDS" and "exon" + /// will be called a "CDS,exon" feature. Columns are sorted + /// in descending order by column sums. Headers will always be + /// used in this case. + /// + /// ⚠️ Warning! This feature is *experimental* and currently does not + /// have unit or integration tests. + #[arg(short, long)] + exclusive: bool, + /// Whether to output the results as a TSV with headers. 
+ /// When --classify is set, headers will always be output. #[arg(short = 'l', long)] headers: bool, @@ -763,71 +777,194 @@ impl HistFeatures { pub fn run(&self) -> Result, GRangesError> { let bedfile = &self.bedfile; let genome = read_seqlens(&self.genome)?; - let bed4_iter = Bed4Iterator::new(bedfile)?; - // Split the elements in the iterator by feature into multiple GRanges objects. - let mut records_by_features: HashMap> = HashMap::new(); - for result in bed4_iter { - let range = result?; - let feature = &range.data.name; - let vec = records_by_features.entry(feature.to_string()).or_default(); - vec.push(range.into_empty()) // drop the feature, since we're hashing on it - } + if !self.exclusive { + // Split the elements in the iterator by feature into multiple GRanges objects. + let mut records_by_features: HashMap> = + HashMap::new(); + for result in bed4_iter { + let range = result?; + let feature = &range.data.name; + let vec = records_by_features.entry(feature.to_string()).or_default(); + vec.push(range.into_empty()) // drop the feature, since we're hashing on it + } - // Load all the *merged* split records into memory as interval trees. - let mut gr_by_features: HashMap> = HashMap::new(); - for (feature, ranges) in records_by_features.into_iter() { - // doing a streaming merge of each feature; bookend merge - let merging_iter = MergingEmptyIterator::new(ranges, 0); + // Load all the *merged* split records into memory as interval trees. 
+ let mut gr_by_features: HashMap> = HashMap::new(); + for (feature, ranges) in records_by_features.into_iter() { + // doing a streaming merge of each feature; bookend merge + let merging_iter = MergingEmptyIterator::new(ranges, 0); - // load into memory and convert to interval trees - let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + // load into memory and convert to interval trees + let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; - assert!(!gr_by_features.contains_key(&feature)); - gr_by_features.insert(feature, gr); - } + assert!(!gr_by_features.contains_key(&feature)); + gr_by_features.insert(feature, gr); + } - // Now, create windows. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - let features: Vec<_> = gr_by_features.keys().cloned().collect(); - let mut feature_matrix = Vec::new(); - for (_feature, gr) in gr_by_features.into_iter() { - // find the features that overlap each window - // TODO/OPTIMIZE: how costly is this clone? - let window_counts = windows.clone() - .left_overlaps(&gr)? - .map_joins(|joins| { - // these are merged, so this is the number of *unique* basepairs - let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); - total_overlaps - })? - .take_data()?; - feature_matrix.push(window_counts); - } + // Now, create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + let features: Vec<_> = gr_by_features.keys().cloned().collect(); + let mut feature_matrix = Vec::new(); + for (_feature, gr) in gr_by_features.into_iter() { + // find the features that overlap each window + // TODO/OPTIMIZE: how costly is this clone? + let window_counts = windows + .clone() + .left_overlaps(&gr)? + .map_joins(|joins| { + // these are merged, so this is the number of *unique* basepairs + let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); + total_overlaps + })? 
+ .take_data()?; + feature_matrix.push(window_counts); + } - let feature_matrix = transpose(feature_matrix); + let feature_matrix = transpose(feature_matrix); - // Unite the windows with the feature matrix. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - let windows = windows.into_granges_data(feature_matrix)?; + // Unite the windows with the feature matrix. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + let windows = windows.into_granges_data(feature_matrix)?; - // Write everything. - if !self.headers { - // we have to *manually* add headers (serde needs flat structs otherwise) - windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + // Write everything. + if !self.headers { + // we have to *manually* add headers (serde needs flat structs otherwise) + windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + } else { + let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; + headers.extend(features); + + let config = TsvConfig { + no_value_string: "NA".to_string(), + headers: Some(headers), + }; + windows.write_to_tsv(self.output.as_ref(), &config)?; + } } else { + // This is done a bit differently than the non-classify case + + // Create GRanges of the feature indices. + // We do this manually to avoid creating a needless data container. + let mut gr = GRanges::new_vec_keyed(&genome); + for result in bed4_iter { + let range = result?; + + // Note the trick here: we are *re-using* indices. + gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; + } + let gr = gr.into_coitrees()?; + + // clone the feature map + let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); + + // Create windows. 
+ let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + // "Reduce" ranges and + let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { + // "reduce" the ranges to a minimum spanning set that contains a vec of indices + let ranges = join.join.reduce_ranges(); + let mut feature_overlaps: HashMap, _> = HashMap::new(); + for range in ranges.iter() { + let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); + key.sort(); // sorted key is compound key + *feature_overlaps.entry(key).or_insert(0) += range.width(); + } + + feature_overlaps + })?; + + // Now, we go over the data and get the observed sets + let mut observed_sets = UniqueIdentifier::new(); + let data = windows_overlaps + .data() + .ok_or(GRangesError::NoDataContainer)?; + for feature_counts in data.iter() { + for feature_set in feature_counts.keys() { + observed_sets.get_or_insert(feature_set); + } + } + let nsets = observed_sets.len(); // total feature sets + + // Go over data again, making the sparse hashmap into a full + // matrix of overlappng basepairs. + let gr = windows_overlaps.map_data(|feature_counts| { + // fill out matrix + let mut counts = vec![0; nsets]; + for (i, feature_set) in observed_sets.keys().enumerate() { + counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); + } + counts + })?; + + // To make this prettier, let's sort by column totals (this + // is a little pricey; we can make option later if needed). + let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?; + let col_totals = column_totals(matrix); + let sorted_indices = sorted_indices_by_values(&col_totals); + + let gr = gr.map_data(|row| { + let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); + row + })?; + + // Create the labels. 
+ let feature_sets: Vec = observed_sets + .keys() + .map(|indices_key| { + let labels: Vec<_> = indices_key + .iter() + .map(|idx| feature_map.get_key(*idx).unwrap().clone()) + .collect(); + labels.join(",") + }) + .collect(); + + // Re-order the headers too by the sorted indices. + let feature_sets: Vec<_> = sorted_indices.iter() + .map(|i| feature_sets[*i].clone()).collect(); + let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; - headers.extend(features); + headers.extend(feature_sets); let config = TsvConfig { no_value_string: "NA".to_string(), headers: Some(headers), }; - windows.write_to_tsv(self.output.as_ref(), &config)?; + gr.write_to_tsv(self.output.as_ref(), &config)?; } - Ok(CommandOutput::new((), None)) } } + + +// get column totals +fn column_totals(matrix: &Vec>) -> Vec { + if matrix.is_empty() || matrix[0].is_empty() { + return Vec::new(); + } + + let num_columns = matrix[0].len(); + let mut totals = vec![0; num_columns]; + + for row in matrix { + for (i, &value) in row.iter().enumerate() { + if i < totals.len() { + totals[i] += value; + } + } + } + + totals +} + +// get descending indices order +fn sorted_indices_by_values(values: &[Position]) -> Vec { + let mut indices: Vec = (0..values.len()).collect(); + indices.sort_by_key(|&i| std::cmp::Reverse(values[i])); + indices +} + diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs index 1d52436..43050b6 100644 --- a/src/data/ndarray.rs +++ b/src/data/ndarray.rs @@ -146,7 +146,7 @@ where { type Item<'a> = U; type OwnedItem = U; - type Output = Array1; + // type Output = Array1; fn get_value(&self, index: usize) -> Self::Item<'_> { self[index] @@ -165,9 +165,9 @@ where index < self.shape()[0] } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) - } + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) 
+ // } } impl IndexedDataContainer for Array2 @@ -176,7 +176,7 @@ where { type Item<'a> = ArrayView1<'a, U>; type OwnedItem = Array1; - type Output = Array2; + // type Output = Array2; fn get_value(&self, index: usize) -> Self::Item<'_> { self.row(index) @@ -194,19 +194,19 @@ where index < self.shape()[0] } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - let cols = self.shape()[1]; + //fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // let cols = self.shape()[1]; - let rows_data: Vec = indices - .iter() - .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) - .collect(); + // let rows_data: Vec = indices + // .iter() + // .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) + // .collect(); - // create a new Array2 from the rows - // shape is (number of indices, number of columns) - Array2::from_shape_vec((indices.len(), cols), rows_data) - .expect("Shape and collected data size mismatch") - } + // // create a new Array2 from the rows + // // shape is (number of indices, number of columns) + // Array2::from_shape_vec((indices.len(), cols), rows_data) + // .expect("Shape and collected data size mismatch") + //} } #[cfg(test)] diff --git a/src/data/vec.rs b/src/data/vec.rs index 42ccd79..52256c5 100644 --- a/src/data/vec.rs +++ b/src/data/vec.rs @@ -11,7 +11,7 @@ where { type Item<'a> = &'a U where Self: 'a; type OwnedItem = U; - type Output = Vec; + // type Output = Vec; fn get_value(&self, index: usize) -> Self::Item<'_> { self.get(index).unwrap() @@ -29,7 +29,7 @@ where self.get(index).is_some() } - fn new_from_indices(&self, indices: &[usize]) -> Self::Output { - Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) - } + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + // Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) + //} } diff --git a/src/granges.rs b/src/granges.rs index cc1fbaf..62e2b8c 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ 
-37,7 +37,7 @@ //! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator //! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees -use std::{collections::HashSet, path::PathBuf}; +use std::{collections::HashSet, hash::Hash, path::PathBuf}; use genomap::GenomeMap; use indexmap::IndexMap; @@ -64,6 +64,7 @@ use crate::{ GenomicRangesTsvSerialize, IndexedDataContainer, IterableRangeContainer, LeftOverlaps, RangeContainer, }, + unique_id::UniqueIdentifier, Position, PositionOffset, }; @@ -305,6 +306,48 @@ impl GRanges, T> { } } +impl GRanges> { + /// Create a new [`GRanges`] object, with a [`UniqueIdentifier`] as a + /// data container. + pub fn new_vec_keyed(seqlens: &IndexMap) -> Self { + let mut ranges = GenomeMap::new(); + for (seqname, length) in seqlens.iter() { + // this should never happen because the error is only if + // insert encounters a seqname that's already been inserted -- that + // cannot happen here. + ranges + .insert(seqname, VecRanges::new(*length)) + .expect("Internal error: please report"); + } + Self { ranges, data: None } + } + pub fn push_range_with_key( + &mut self, + seqname: &str, + start: Position, + end: Position, + key: &K, + ) -> Result<(), GRangesError> { + let data_ref = match self.data { + Some(ref mut data) => data, + None => { + self.data = Some(UniqueIdentifier::new()); + self.data.as_mut().unwrap() // Safe unwrap because we just inserted a value + } + }; + + let index = data_ref.get_or_insert(key); + let range = RangeIndexed::new(start, end, index); + + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) + } +} + impl GRanges, T> { /// Adjust all the ranges in this [`GRanges`] object in place. 
pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { @@ -805,7 +848,7 @@ where if let Some(right_ranges) = right.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; @@ -846,7 +889,7 @@ where if let Some(right_ranges) = right.0.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; @@ -898,7 +941,7 @@ where if let Some(right_ranges) = right.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start(), left_range.end(), join_data)?; @@ -949,7 +992,7 @@ where if let Some(right_ranges) = right.0.ranges.get(seqname) { right_ranges.query(left_range.start(), left_range.end(), |right_range| { // NOTE: right_range is a coitrees::IntervalNode. - join_data.add_right(&left_range, right_range); + join_data.add_right(right_range); }); } gr.push_range_with_join(seqname, left_range.start(), left_range.end(), join_data)?; diff --git a/src/join.rs b/src/join.rs index 44f08fe..a40f3d2 100644 --- a/src/join.rs +++ b/src/join.rs @@ -2,80 +2,174 @@ //! 
#![allow(clippy::all)] +use std::collections::HashSet; + use crate::{ traits::{GenericRange, IndexedDataContainer, JoinDataOperations}, Position, }; -/// [`LeftGroupedJoin`] contains information about the right ranges -/// and their degree of overlap with a focal left range. This information -/// is designed to facilitate downstream statistical sumamries of the -/// corresponding data in overlapping ranges. +/// This is a generic range used just in join logic, to avoid +/// having to handle bringing range types into [`LeftGroupedJoin`], +/// which clog up the API a bit. +/// These can represent indexed and empty ranges. #[derive(Clone, Debug, PartialEq)] -pub struct LeftGroupedJoin { - /// The data index for the left range. - left: Option, +pub struct RangeTuple((Position, Position, Option)); - /// A `Vec` of the indices for the overlapping right ranges. - /// This is `None` if the right ranges do not have a data container. - rights: Option>, +impl GenericRange for RangeTuple { + fn start(&self) -> Position { + self.0 .0 + } + fn end(&self) -> Position { + self.0 .1 + } + fn index(&self) -> Option { + self.0 .2 + } +} - /// The length of the left range. - left_length: Position, +/// This is a special "reduced" range that stores indices +/// to multiple data elements. +#[derive(Clone, Debug, PartialEq)] +pub struct RangeReduced((Position, Position, HashSet>)); - /// The lengths of the right ranges. - right_lengths: Vec, +impl RangeReduced { + pub fn indices(&self) -> &HashSet> { + &self.0 .2 + } +} - /// The lengths of the overlaps between the left and right ranges. - overlaps: Vec, - // TODO: we may want some simple summary of whether an overlapping range is - // up or downstream. I think the cleanest summary is a signed integer - // representing the side and degree of non-overlap. E.g. a range - // that overlaps another but overhangs the 3' side of the focal left - // range by 10bp is +10; if it were 5', it would be -10. 
+/// Create a vector of "reduced" or flattened ranges, that stack the indices of each range. +/// +/// +/// This first finds every range position, sorts, and dedups these. Then, +/// it iterates through each range of adjacent positions. Each of these +/// new ranges is guaranteed to be covered by >= 1 range. +pub fn reduce_ranges(ranges: &Vec) -> Vec { + let mut range_ends: Vec = ranges + .iter() + .flat_map(|range| vec![range.start(), range.end()].into_iter()) + .collect(); + range_ends.sort_unstable(); + range_ends.dedup(); + + let mut ranges_reduced = Vec::new(); + for range in range_ends.windows(2) { + let mut indices = HashSet::new(); + if let [start, end] = range { + for range in ranges { + if range.start() < *end && range.end() > *start { + indices.insert(range.index()); + } + } + if !indices.is_empty() { + ranges_reduced.push(RangeReduced((*start, *end, indices))); + } + } + } + ranges_reduced +} - // left_data: Option<&'a DL>, - // right_data: Option<&'a DR>, +impl GenericRange for RangeReduced { + fn start(&self) -> Position { + self.0 .0 + } + fn end(&self) -> Position { + self.0 .1 + } + // Note: [`RangeReduced`] do not have valid indices, + // so this returns [`None`]. (They do have a [`Vec>`] + // of indices, but this has to be accessed through the type.) + fn index(&self) -> Option { + None + } +} + +/// [`LeftGroupedJoin`] contains information about the right ranges +/// and their degree of overlap with a focal left range. This information +/// is designed to facilitate downstream statistical sumamries of the +/// corresponding data in overlapping ranges. +#[derive(Clone, Debug, PartialEq)] +pub struct LeftGroupedJoin { + /// The left range. + pub left: RangeTuple, + + /// A `Vec` of the right overlapping ranges (unsorted). + // NOTE: previously lengths, overlap width, and their data + // indices were stored. For just one extra u32 we can store + // all the data in the original structure. 
+ pub rights: Vec, } impl LeftGroupedJoin { /// Create a new [`LeftGroupedJoin`]. pub fn new(left_range: &R) -> Self { Self { - left: left_range.index(), - rights: None, - left_length: left_range.width(), - right_lengths: Vec::new(), - overlaps: Vec::new(), - // left_data, - // right_data, + left: RangeTuple(left_range.as_tuple()), + rights: Vec::new(), } } /// Add a right (overlapping) range to this [`LeftGroupedJoin`]. /// // Note: in principle, this can be called on *non-overlapping* right ranges too, // for a full-outer join. - pub fn add_right(&mut self, left: &R, right: &Q) { - if let Some(right_index) = right.index() { - // the right range has data -- add to vec, initializing if not there - self.rights.get_or_insert_with(Vec::new).push(right_index) - } - self.right_lengths.push(right.width()); - self.overlaps.push(left.overlap_width(right)); + pub fn add_right(&mut self, right: &R) { + self.rights.push(RangeTuple(right.as_tuple())) + } + + /// Sort the right ranges, for faster downstream processing. + pub fn sort_ranges(&mut self) { + self.rights.sort_by(|a, b| { + a.start() + .cmp(&b.start()) + .then_with(|| a.end().cmp(&b.end())) + .then_with(|| a.index().cmp(&b.index())) + }); + } + + /// "Reduce" the ranges into a minimum set, with all + /// their indices gathered in a [`Vec>`]. + /// This returns a [`Vec`]. + pub fn reduce_ranges(&mut self) -> Vec { + // we need to trim these by the left range to get the + // proper overlaps within this left range + let rights: Vec<_> = self + .rights + .iter() + .map(|range| { + let (start, end) = range.overlap_range(&self.left).unwrap(); + RangeTuple((start, end, range.index())) + }) + .collect(); + reduce_ranges(&rights) } + /// Return whether this left range has any [`LeftGroupedJoin`]. pub fn has_overlaps(&self) -> bool { - !self.overlaps.is_empty() + !self.overlaps().is_empty() } /// Retrieve the number of right overlaps. 
pub fn num_overlaps(&self) -> usize { - self.overlaps.len() + self.overlaps().len() } /// Retrieve the right overlaps. - pub fn overlaps(&self) -> &Vec { - &self.overlaps + pub fn overlaps(&self) -> Vec { + self.rights + .iter() + .map(|r| r.overlap_width(&self.left)) + .collect() + } + + /// Get the left index. + pub fn left_index(&self) -> Option { + self.left.index() + } + + /// Get the right indices. + pub fn right_indices(&self) -> Vec> { + self.rights.iter().map(|r| r.index()).collect() } } @@ -164,13 +258,11 @@ where self.joins .into_iter() .map(|join| { - let left_data = self.left_data.get_owned(join.left.unwrap()); + let left_data = self.left_data.get_owned(join.left_index().unwrap()); let right_data = join - .rights - .as_ref() - .unwrap() + .right_indices() .iter() - .map(|idx| self.right_data.get_owned(*idx)) + .map(|idx| self.right_data.get_owned(idx.unwrap())) .collect(); func(CombinedJoinData { @@ -257,13 +349,11 @@ where self.joins .into_iter() .map(|join| { - let right_indices = join.rights.as_ref(); - let right_data = right_indices.map_or(Vec::new(), |indices| { - indices - .iter() - .map(|idx| self.right_data.get_owned(*idx)) - .collect() - }); + let right_indices = join.right_indices(); + let right_data = right_indices + .iter() + .map(|idx| self.right_data.get_owned(idx.unwrap())) + .collect(); func(CombinedJoinDataLeftEmpty { join, right_data }) }) @@ -343,7 +433,7 @@ where self.joins .into_iter() .map(|join| { - let left_data = self.left_data.get_owned(join.left.unwrap()); + let left_data = self.left_data.get_owned(join.left_index().unwrap()); func(CombinedJoinDataRightEmpty { join, left_data }) }) @@ -443,10 +533,9 @@ impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> { #[cfg(test)] mod tests { + use super::*; use crate::ranges::RangeIndexed; - use super::{JoinData, LeftGroupedJoin}; - #[test] fn test_join_data_new() { let left_data = vec![1, 2]; @@ -457,8 +546,94 @@ mod tests { let left = RangeIndexed::new(0, 10, 1); let mut join = 
LeftGroupedJoin::new(&left); let right = RangeIndexed::new(8, 10, 1); - join.add_right(&left, &right); + join.add_right(&right); jd.push(join); assert_eq!(jd.len(), 1); } + + #[test] + fn test_single_range_indexed() { + let ranges = vec![RangeIndexed { + start: 1, + end: 5, + index: 0, + }]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 1); + assert_eq!(reduced[0].0 .0, 1); + assert_eq!(reduced[0].0 .1, 5); + assert!(reduced[0].0 .2.contains(&Some(0))); + } + + #[test] + fn test_overlapping_ranges_indexed() { + let ranges = vec![ + RangeIndexed { + start: 1, + end: 4, + index: 0, + }, + RangeIndexed { + start: 3, + end: 6, + index: 1, + }, + ]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 3); + + let mut iter = reduced.iter(); + let first_range = iter.next().unwrap(); + assert_eq!(first_range.start(), 1); + assert_eq!(first_range.end(), 3); + assert_eq!(first_range.indices().len(), 1); + assert!(first_range.indices().contains(&Some(0))); + + let second_range = iter.next().unwrap(); + assert_eq!(second_range.start(), 3); + assert_eq!(second_range.end(), 4); + assert_eq!(second_range.indices().len(), 2); + assert!(second_range.indices().contains(&Some(0))); + assert!(second_range.indices().contains(&Some(1))); + + let third_range = iter.next().unwrap(); + assert_eq!(third_range.start(), 4); + assert_eq!(third_range.end(), 6); + assert_eq!(third_range.indices().len(), 1); + assert!(third_range.indices().contains(&Some(1))); + } + + #[test] + fn test_non_overlapping_ranges_indexed() { + let ranges = vec![ + RangeIndexed { + start: 1, + end: 3, + index: 0, + }, + RangeIndexed { + start: 4, + end: 6, + index: 1, + }, + ]; + let reduced = reduce_ranges(&ranges); + + assert_eq!(reduced.len(), 2); + + let mut iter = reduced.iter(); + let first_range = iter.next().unwrap(); + assert_eq!(first_range.start(), 1); + assert_eq!(first_range.end(), 3); + assert_eq!(first_range.indices().len(), 1); + 
assert!(first_range.indices().contains(&Some(0))); + + let second_range = iter.next().unwrap(); + assert_eq!(second_range.start(), 4); + assert_eq!(second_range.end(), 6); + assert_eq!(second_range.indices().len(), 1); + assert!(second_range.indices().contains(&Some(1))); + } } diff --git a/src/lib.rs b/src/lib.rs index 28d4f77..b09d163 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -364,6 +364,7 @@ pub mod ranges; pub mod sequences; pub mod test_utilities; pub mod traits; +pub mod unique_id; // bringing these CLI modules into lib.rs rather than main/ allows for // use in integration tests and other Rust-side command line work @@ -425,8 +426,8 @@ pub mod prelude { }; pub use crate::merging_iterators::{ - ConditionalMergingResultIterator, MergingEmptyIterator, MergingEmptyResultIterator, - MergingResultIterator, + ConditionalMergingIterator, ConditionalMergingResultIterator, MergingEmptyIterator, + MergingEmptyResultIterator, MergingResultIterator, }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, diff --git a/src/main/mod.rs b/src/main/mod.rs index 243a3e7..70c6437 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -20,20 +20,24 @@ usage: granges [--help] Subcommands: - adjust: Adjust each genomic range, e.g. to add a kilobase to each end. + adjust: Adjust each genomic range, e.g. to add a kilobase to each end. - filter: Filter the left ranges based on whether they have at least one - overlap with a right range. This is equivalent to a filtering - "semi-join" in SQL terminology. + filter: Filter the left ranges based on whether they have at least one + overlap with a right range. This is equivalent to a filtering + "semi-join" in SQL terminology. - map: Compute the left grouped overlaps between the left genomic ranges - and right genomic ranges, and apply one or more operations to the - score column of the right BED5 file. + hist-features: Histogram feature coverage per windows. 
This merges overlapping + ranges first. - merge: Merge ranges that are within a minimum distance of each other. + map: Compute the left grouped overlaps between the left genomic ranges + and right genomic ranges, and apply one or more operations to the + score column of the right BED5 file. + + merge: Merge ranges that are within a minimum distance of each other. - windows: Create a set of genomic windows of the specified width (in basepairs), - stepping the specified step size (the width, by default). + windows: Create a set of genomic windows of the specified width (in + basepairs), stepping the specified step size (the width, by + default). NOTE: granges is under active development. It is not currently meant to be diff --git a/src/merging_iterators.rs b/src/merging_iterators.rs index 7b2d939..308a566 100644 --- a/src/merging_iterators.rs +++ b/src/merging_iterators.rs @@ -240,6 +240,119 @@ where } } +/// A merging iterator (over [`GenomicRangeRecord] items, +/// i.e. from a parsing iterator) that has an additional merge condition tested +/// by function `G` based on the items. 
+pub struct ConditionalMergingIterator +where + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + group: G, + accumulated_data: Vec, +} + +impl ConditionalMergingIterator +where + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F, group: G) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + group, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for ConditionalMergingIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + type Item = GenomicRangeRecord; + + fn next(&mut self) -> Option { + for next_range in self.inner.by_ref() { + if let Some(ref mut last_range) = self.last_range { + // If we have an additional group-by merging condition, + // check that. If not set this is always true. + let satifies_groupby = (self.group)(last_range, &next_range); + + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && satifies_groupby + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. 
+ let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extended range with accumulated data. + return Some(return_range); + } + } else { + // There is no last range -- this must be the first. + self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + }); + } + } + + None // Return None when all items have been processed + } +} + /// A merging iterator (over [`Result, GRangesError`] items, /// i.e. from a parsing iterator) that has an additional merge condition tested /// by function `G` based on the items. diff --git a/src/traits.rs b/src/traits.rs index a2d062b..43379de 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -79,6 +79,12 @@ pub trait GenericRange: Clone { overlap_end.saturating_sub(overlap_start) } + /// Return whether this range overlaps another range. 
+ fn has_overlap_with(&self, other: &R) -> bool { + // TODO/OPTIMIZE inefficient FIXME + self.overlap_width(other) > 0 + } + fn distance_or_overlap(&self, other: &R) -> PositionOffset { if self.end() > other.start() && self.start() < other.end() { // The ranges overlap -- return how much as a negative number @@ -308,7 +314,7 @@ pub trait IndexedDataContainer: DataContainer { where Self: 'a; type OwnedItem; - type Output; + // type Output; // this type is needed to reference the core underlying type, // eg to handle reference types fn is_valid_index(&self, index: usize) -> bool; @@ -328,8 +334,8 @@ pub trait IndexedDataContainer: DataContainer { invalid } - /// Create a new data container using a set of range indices. - fn new_from_indices(&self, indices: &[usize]) -> Self::Output; + ///// Create a new data container using a set of range indices. + // fn new_from_indices(&self, indices: &[usize]) -> Self::Output; } /// The Sequences trait defines generic functionality for diff --git a/src/unique_id.rs b/src/unique_id.rs new file mode 100644 index 0000000..cbb7cea --- /dev/null +++ b/src/unique_id.rs @@ -0,0 +1,210 @@ +//! The [`UniqueIdentifier`] type, which stores a mapping between +//! some key type and a `usize` index. +//! +//! Note that these are not thread-safe currently but +//! it is relatively easy to do -- submit a GitHub +//! issue if you need this feature. 
+ +use std::{ + cell::RefCell, + collections::{hash_map::Keys, HashMap}, + hash::Hash, +}; + +use crate::traits::{DataContainer, IndexedDataContainer}; + +impl DataContainer for UniqueIdentifier {} +impl IndexedDataContainer for UniqueIdentifier { + type Item<'a> = K where K: 'a; + type OwnedItem = K; + + fn get_value(&self, index: usize) -> ::Item<'_> { + self.get_key(index).unwrap().clone() + } + + fn len(&self) -> usize { + self.len() + } + + fn get_owned(&self, index: usize) -> ::OwnedItem { + self.get_value(index).clone() + } + + fn is_valid_index(&self, index: usize) -> bool { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .contains_key(&index) + } +} + +#[derive(Clone, Debug)] +pub struct UniqueIdentifier { + map: HashMap, + next_key: usize, + reverse_map: RefCell>>, +} + +impl Default for UniqueIdentifier +where + K: std::cmp::Eq + Hash + Clone, +{ + fn default() -> Self { + Self::new() + } +} + +impl UniqueIdentifier +where + K: std::cmp::Eq + Hash + Clone, +{ + pub fn new() -> Self { + Self { + map: HashMap::new(), + next_key: 0, + reverse_map: RefCell::new(None), + } + } + + pub fn get_or_insert(&mut self, s: &K) -> usize { + match self.map.get(s) { + Some(&key) => key, + None => { + let key = self.next_key; + self.map.insert(s.clone(), key); + self.next_key += 1; + // we need to invalidate the cached reverse map + let mut reverse_map_borrow = self.reverse_map.borrow_mut(); + *reverse_map_borrow = None; + key + } + } + } + + fn ensure_reverse_map_created(&self) { + let mut reverse_map_borrow = self.reverse_map.borrow_mut(); + if reverse_map_borrow.is_none() { + let reverse_map = self.map.iter().map(|(k, v)| (*v, k.clone())).collect(); + *reverse_map_borrow = Some(reverse_map); + } + } + + pub fn get_key(&self, index: usize) -> Option { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .get(&index) + .cloned() + } + + pub fn is_valid_index(&self, index: usize) -> 
bool { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .contains_key(&index) + } + + pub fn keys(&self) -> Keys<'_, K, usize> { + self.map.keys() + } + + pub fn indices(&self) -> Vec { + self.ensure_reverse_map_created(); + self.reverse_map + .borrow() + .as_ref() + .unwrap() + .keys() + .cloned() + .collect() + } + + pub fn get_index(&self, s: &K) -> Option { + self.map.get(s).cloned() + } + + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insert_and_get_key() { + let mut ui = UniqueIdentifier::new(); + let key = "test_key"; + let index = ui.get_or_insert(&key); + + assert_eq!(index, 0); + assert_eq!(ui.get_key(index), Some(key)); + } + + #[test] + fn test_is_valid_index() { + let mut ui = UniqueIdentifier::new(); + let key = "another_key"; + ui.get_or_insert(&key); + + assert!(ui.is_valid_index(0)); + assert!(!ui.is_valid_index(1)); + + let key = "another!"; + let index = ui.get_or_insert(&key); + assert_eq!(index, 1); + assert!(ui.is_valid_index(index)); + } + + #[test] + fn test_len() { + let mut ui = UniqueIdentifier::new(); + assert_eq!(ui.len(), 0); + + ui.get_or_insert(&"key1"); + assert_eq!(ui.len(), 1); + + ui.get_or_insert(&"key2"); + assert_eq!(ui.len(), 2); + + // Inserting an existing key should not increase the length + ui.get_or_insert(&"key1"); + assert_eq!(ui.len(), 2); + } + + #[test] + fn test_indices() { + let mut ui = UniqueIdentifier::new(); + ui.get_or_insert(&"key1"); + ui.get_or_insert(&"key2"); + + let indices = ui.indices(); + assert_eq!(indices.len(), 2); + assert!(indices.contains(&0)); + assert!(indices.contains(&1)); + } + + #[test] + fn test_get_index() { + let mut ui = UniqueIdentifier::new(); + let key = "key_to_find"; + ui.get_or_insert(&key); + + let index = ui.get_index(&key); + assert_eq!(index, Some(0)); + + let missing_key = "missing_key"; + 
assert_eq!(ui.get_index(&missing_key), None); + } +}