diff --git a/scripts/benchmark_summary.py b/scripts/benchmark_summary.py index 110d27b..73f2c23 100644 --- a/scripts/benchmark_summary.py +++ b/scripts/benchmark_summary.py @@ -40,7 +40,7 @@ def calculate_ratios(criterion_dir): percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100 # Format the output to show 3 decimal places print( - f"{comparison} - Granges is {percent_faster:.3f}% faster than Bedtools" + f"{comparison} - GRanges is {percent_faster:.3f}% faster than Bedtools" ) diff --git a/src/granges.rs b/src/granges.rs index 90c064f..9d54b31 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -58,6 +58,7 @@ use std::{collections::HashSet, path::PathBuf}; +use coitrees::IntervalNode; use genomap::GenomeMap; use indexmap::IndexMap; @@ -65,6 +66,7 @@ use crate::{ ensure_eq, io::OutputFile, iterators::GRangesIterator, + join::{JoinData, LeftGroupedJoin}, prelude::GRangesError, ranges::{ coitrees::{COITrees, COITreesEmpty, COITreesIndexed}, @@ -198,9 +200,10 @@ where }); let mut writer = output.writer()?; + let data_ref = self.data.as_ref().ok_or(GRangesError::NoDataContainer)?; let seqnames = self.seqnames(); for range in self.iter_ranges() { - let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); + let record = range.to_record(&seqnames, data_ref); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) @@ -293,6 +296,7 @@ where let mut gr: GRanges = GRanges::new_vec(&self.seqlens()); let seqlens = self.seqlens(); for (seqname, ranges) in self.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. let seqlen = seqlens.get(seqname).unwrap(); for range in ranges.iter_ranges() { let flanking_ranges = range.flanking_ranges::(left, right, *seqlen); @@ -317,6 +321,9 @@ impl GRangesEmpty> { GRangesEmpty(GRanges::new_vec(seqlens)) } + /// Sort the ranges by position for this [`GRangesEmpty`] object. + /// + /// This operation is consuming and returns the sorted [`GRangesEmpty`] object. pub fn sort(self) -> Self { GRangesEmpty(self.0.sort()) } @@ -326,6 +333,85 @@ impl GRangesEmpty> { } } +impl GRangesEmpty { + /// Make a [`GRangesEmpty`] with ranges from (possibly overlapping) windows. + /// + /// # Arguments + /// * `seqlens`: the sequence (e.g. chromosome) lengths. + /// * `width`: the window width, in basepairs. + /// * `chop`: whether to cut off the last window, if there is a remainder less than the width. + /// * `step`: the step length, in basepairs; if None, step is `width`. + /// + /// # Examples + /// + /// ``` + /// use granges::prelude::*; + /// + /// let sl = seqlens!( "chr1" => 11); + /// + /// // no step don't chop off remainder + /// let gr = GRangesEmpty::from_windows(&sl, 5, None, false).unwrap(); + /// + /// let mut range_iter = gr.iter_ranges(); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (0, 5, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (5, 10, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (10, 11, None)); + /// + /// // no step do chop off remainder + /// let gr = GRangesEmpty::from_windows(&sl, 5, None, true).unwrap(); + /// + /// let mut range_iter = gr.iter_ranges(); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (0, 5, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (5, 10, None)); + /// + /// // with step don't chop off remainder + /// let gr = GRangesEmpty::from_windows(&sl, 5, Some(2), false).unwrap(); + /// + /// let mut range_iter = gr.iter_ranges(); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (0, 5, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (2, 7, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (4, 9, None)); + /// assert_eq!(range_iter.next().unwrap().as_tuple(), (6, 11, None)); + /// + /// ``` + pub fn from_windows( + seqlens: &IndexMap, + width: u32, + step: Option, + chop: bool, + ) -> Result, GRangesError> { + let mut gr: GRangesEmpty = GRangesEmpty::new_vec(seqlens); + + // have we encountered a remainder chunk? + + let mut remainder = false; + // iterate over each chromosome and create windows + for (seqname, len) in seqlens { + let mut start = 0; + while start < *len { + let mut end = start + width - 1; + + if end >= *len { + if chop { + break; + } else { + end = std::cmp::min(end, len - 1); + } + } + if end - start + 1 < width { + if remainder { + break; + } + remainder = true; + } + gr.push_range(seqname, start, end + 1)?; + start += step.unwrap_or(width); + } + } + Ok(gr) + } +} + impl GRangesEmpty> { pub fn adjust_ranges(self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { GRangesEmpty(self.0.adjust_ranges(start_delta, end_delta)) @@ -357,6 +443,7 @@ where let mut gr: GRangesEmpty = GRangesEmpty::new_vec(&self.seqlens()); let seqlens = self.seqlens(); for (seqname, ranges) in self.0.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. let seqlen = seqlens.get(seqname).unwrap(); for range in ranges.iter_ranges() { let flanking_ranges = range.flanking_ranges::(left, right, *seqlen); @@ -381,10 +468,51 @@ impl GRanges> { if self.data.is_none() { self.data = Some(Vec::new()); } + let data_ref = self.data.as_mut().ok_or(GRangesError::NoDataContainer)?; + // push data to the vec data container, getting the index + let index: usize = { + data_ref.push(data); + data_ref.len() - 1 // new data index + }; + // push an indexed range + let range = RangeIndexed::new(start, end, index); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) + } +} + +impl<'a, DL, DR> GRanges> { + /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. + /// + /// Note that this has slightly different behavior than other [`GRanges.push_range()`] + /// methods, in that it requires that the [`JoinData`] object be initialized first. + /// This is because were this not to be the case, each call to `push_ranges()` would + /// require references to the right and left data containers. This is not ergonomic. + /// + /// # Panics + /// This will panic if the developer did not initialize the new [`JoinData`] + /// correctly. This is not handled with a user error since it reflects a + /// developer error. + pub fn push_range_with_join( + &mut self, + seqname: &str, + start: Position, + end: Position, + data: LeftGroupedJoin, + ) -> Result<(), GRangesError> { + if self.data.is_none() { + // unlike push_range() + panic!("Internal error: JoinData not initialized."); + } + let data_ref = self.data.as_mut().ok_or(GRangesError::NoDataContainer)?; // push data to the vec data container, getting the index let index: usize = { - self.data.as_mut().unwrap().push(data); - self.data.as_mut().unwrap().len() - 1 // new data index + data_ref.push(data); + data_ref.len() - 1 // new data index }; // push an indexed range let range = RangeIndexed::new(start, end, index); @@ -403,7 +531,8 @@ where T: TsvSerialize, >::Item: TsvSerialize, { - /// + /// Write this [`GRanges`] object to a TSV file, using the + /// [`TsvSerialize`] trait methods defiend for the items in `T`. pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) let output = output.map_or(OutputFile::new_stdout(None), |file| { @@ -412,12 +541,49 @@ where let mut writer = output.writer()?; let seqnames = self.seqnames(); + let data_ref = self.data.as_ref().ok_or(GRangesError::NoDataContainer)?; for range in self.iter_ranges() { - let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); + let record = range.to_record(&seqnames, data_ref); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) } + + /// Conduct a left overlap join, consuming self and returning a new + /// [`GRanges>`]. + /// + /// The [`JoinData`] container contains references to both left and right + /// data containers and a [`Vec`]. Each [`OverlapJoin`] represents + /// a summary of an overlap, which downstream operations use to calculate + /// statistics using the information about overlaps. + pub fn left_overlaps( + self, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, + ) -> Result, JoinData<'a, T, DR>>, GRangesError> + where + IntervalNode: GenericRange, + { + let mut gr: GRanges> = + GRanges::new_vec(&self.seqlens()); + + let right_ref = right.as_granges_ref(); + gr.data = Some(JoinData::new(self.data, right_ref.data.as_ref())); + + for (seqname, left_ranges) in self.ranges.iter() { + for left_range in left_ranges.iter_ranges() { + // Left join: every left range gets a JoinData. + let mut join_data = LeftGroupedJoin::new(&left_range); + if let Some(right_ranges) = right_ref.ranges.get(seqname) { + right_ranges.query(left_range.start(), left_range.end(), |right_range| { + // NOTE: right_range is a coitrees::IntervalNode. + join_data.add_right(&left_range, right_range); + }); + } + gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; + } + } + Ok(gr) + } } impl<'a, C, T> GRanges @@ -478,6 +644,44 @@ impl GRanges, T> { } } +impl<'a> GRangesEmpty { + /// Conduct a left overlap join, consuming self and returning a new + /// [`GRanges>`]. + /// + /// The [`JoinData`] container contains references to both left and right + /// data containers and a [`Vec`]. Each [`OverlapJoin`] represents + /// a summary of an overlap, which downstream operations use to calculate + /// statistics using the information about overlaps. + pub fn left_overlaps( + self, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, + ) -> Result, JoinData<'a, (), DR>>, GRangesError> + where + IntervalNode: GenericRange, + { + let mut gr: GRanges> = + GRanges::new_vec(&self.seqlens()); + + let right_ref = right.as_granges_ref(); + gr.data = Some(JoinData::new(None, right_ref.data.as_ref())); + + for (seqname, left_ranges) in self.0.ranges.iter() { + for left_range in left_ranges.iter_ranges() { + // Left join: every left range gets a JoinData. + let mut join_data = LeftGroupedJoin::new(&left_range); + if let Some(right_ranges) = right_ref.ranges.get(seqname) { + right_ranges.query(left_range.start(), left_range.end(), |right_range| { + // NOTE: right_range is a coitrees::IntervalNode. + join_data.add_right(&left_range, right_range); + }); + } + gr.push_range_with_join(seqname, left_range.start, left_range.end, join_data)?; + } + } + Ok(gr) + } +} + impl GRanges> { /// Create a new [`GRanges>`] object from an iterator over /// [`GenomicRangeRecord`] records. @@ -625,14 +829,13 @@ where ) -> Result>, GRangesError> { let mut gr: GRanges> = GRanges::new_vec(&self.seqlens()); - let data = std::mem::take(&mut self.data).unwrap(); + let right_ref = right.as_granges_ref(); + let data = std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)?; let mut old_indices = HashSet::new(); // the old indices to *keep* let mut new_indices = Vec::new(); - let mut current_index = 0; - let right_ref = right.as_granges_ref(); for (seqname, left_ranges) in self.ranges.iter() { for left_range in left_ranges.iter_ranges() { if let Some(right_ranges) = right_ref.ranges.get(seqname) { @@ -647,6 +850,7 @@ where left_range.end(), current_index, )?; + // unwrap should be safe, since this is an indexed GRanges old_indices.insert(left_range.index().unwrap()); new_indices.push(current_index); current_index += 1; @@ -690,11 +894,51 @@ where } } +/// [`PartialEq`] for [`GRanges`] objects. +/// +/// This is a more powerful comparison operator than [`GRanges.is_equal_to()`], since it will first +/// convert one type of ranges to another. +/// +/// A [`GRanges`] object is considered equal to another if: +/// 1. Same number or ranges (this is checked first for efficiency). +/// 2. Same metadata. +/// 3. Same ranges. This requires converting ranges from different range container map types to a +/// [`VecRanges`]. **Warning**: this is currently memory intensive. +/// +/// # Developer Notes +/// This uses type conversion in the range container types; see +/// [`crate::granges::ranges::comparison.rs`]. +impl PartialEq> for GRanges +where + CL: IterableRangeContainer + PartialEq, + CR: IterableRangeContainer + PartialEq, + DL: PartialEq, +{ + fn eq(&self, other: &GRanges) -> bool { + if self.len() != other.len() { + return false; + } + + let data_matches = match (&self.data, &other.data) { + (Some(self_data), Some(other_data)) => self_data == other_data, + (None, None) => true, + _ => false, + }; + + let self_ranges = self.iter_ranges(); + let other_ranges = other.iter_ranges(); + let ranges_eq = self_ranges.zip(other_ranges).all(|(r1, r2)| r1 == r2); + ranges_eq && data_matches + } +} + #[cfg(test)] mod tests { use crate::{ + join::LeftGroupedJoin, prelude::*, test_utilities::{granges_test_case_01, granges_test_case_02, random_vecranges}, + Position, }; #[test] @@ -799,4 +1043,135 @@ mod tests { let second_range = gr_left_iter.next().unwrap(); assert_eq!(second_range.end(), 210); } + + #[test] + fn test_from_windows() { + let sl = seqlens!( "chr1" => 35); + // Test chop + let gr = GRangesEmpty::from_windows(&sl, 10, None, true).unwrap(); + assert_eq!(gr.len(), 3, "{:?}", gr); + dbg!(&gr); + gr.iter_ranges().for_each(|r| assert_eq!(r.width(), 10)); + + // Test no chop + let gr = GRangesEmpty::from_windows(&sl, 10, None, false).unwrap(); + assert_eq!(gr.iter_ranges().last().unwrap().width(), 5); + + // Test sliding with step of 2, with chop + let sl = seqlens!( "chr1" => 21); + let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), true).unwrap(); + let mut expected_ranges_chop: Vec<(String, Position, Position)> = vec![ + ("chr1", 0, 10), + ("chr1", 2, 12), + ("chr1", 4, 14), + ("chr1", 6, 16), + ("chr1", 8, 18), + ("chr1", 10, 20), + ] + .into_iter() + .map(|(seq, s, e)| (seq.to_string(), s, e)) + .collect(); + let seqnames = sl.keys().map(|x| x.to_string()).collect::>(); + let actual_ranges: Vec<(String, Position, Position)> = gr + .iter_ranges() + .map(|r| (r.seqname(&seqnames), r.start(), r.end())) + .collect(); + assert_eq!(actual_ranges, expected_ranges_chop); + + // The same as above, without chop -- goes to seqlen with little remainder. + let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), false).unwrap(); + expected_ranges_chop.push(("chr1".to_string(), 12, 21)); + let actual_ranges: Vec<(String, Position, Position)> = gr + .iter_ranges() + .map(|r| (r.seqname(&seqnames), r.start(), r.end())) + .collect(); + + assert_eq!(actual_ranges, expected_ranges_chop); + } + + #[test] + fn test_left_overlaps() { + let sl = seqlens!("chr1" => 50); + let windows: GRangesEmpty = + GRangesEmpty::from_windows(&sl, 10, None, true).unwrap(); + + let windows_len = windows.len(); + + let mut right_gr: GRanges> = GRanges::new_vec(&sl); + right_gr.push_range("chr1", 1, 2, 1.1).unwrap(); + right_gr.push_range("chr1", 5, 7, 2.8).unwrap(); + right_gr.push_range("chr1", 21, 35, 1.2).unwrap(); + right_gr.push_range("chr1", 23, 24, 2.9).unwrap(); + let right_gr = right_gr.into_coitrees().unwrap(); + + let joined_results = windows.left_overlaps(&right_gr).unwrap(); + + // get join data + let data = joined_results.data.unwrap(); + + // check is left join + assert_eq!(data.len(), windows_len); + + let mut join_iter = data.iter(); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 0); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 1); + // rest are empty TODO should check + } + + #[test] + fn test_partial_eq() { + // check equality case + let gr1 = granges_test_case_01(); + let gr2 = granges_test_case_01(); + assert_eq!(gr1, gr2); + + // check minor difference case + let sl = seqlens!( "chr1" => 30, "chr2" => 100 ); + let mut gr3 = GRanges::>::new_vec(&sl); + gr3.push_range("chr1", 0, 4, 0.1).unwrap(); // one bp diff end + gr3.push_range("chr1", 4, 7, 8.1).unwrap(); + gr3.push_range("chr1", 10, 17, 10.1).unwrap(); + gr3.push_range("chr2", 10, 20, 3.7).unwrap(); + gr3.push_range("chr2", 18, 32, 1.1).unwrap(); + + assert_ne!(gr1, gr3); + + // check differential order case + let sl = seqlens!( "chr1" => 30, "chr2" => 100 ); + let mut gr4 = GRanges::>::new_vec(&sl); + gr4.push_range("chr1", 4, 7, 8.1).unwrap(); + gr4.push_range("chr1", 0, 5, 1.1).unwrap(); // swapped with above + gr4.push_range("chr1", 10, 17, 10.1).unwrap(); + gr4.push_range("chr2", 10, 20, 3.7).unwrap(); + gr4.push_range("chr2", 18, 32, 1.1).unwrap(); + + assert_ne!(gr3, gr4); + + // check manual case (for typos) + let sl = seqlens!( "chr1" => 30, "chr2" => 100 ); + let mut gr5 = GRanges::>::new_vec(&sl); + gr5.push_range("chr1", 0, 5, 1.1).unwrap(); + gr5.push_range("chr1", 4, 7, 8.1).unwrap(); + gr5.push_range("chr1", 10, 17, 10.1).unwrap(); + gr5.push_range("chr2", 10, 20, 3.7).unwrap(); + gr5.push_range("chr2", 18, 32, 1.1).unwrap(); + + assert_eq!(gr1, gr5) + } + + #[test] + fn test_is_equal_to() { + let vec_orig = granges_test_case_01(); + assert_eq!(vec_orig, vec_orig); + + let mut vec = vec_orig.clone(); + vec.push_range("chr1", 0, 10, 3.4).unwrap(); + assert_ne!(vec_orig, vec); + + let vec = vec.sort(); + let coit = vec.clone().into_coitrees().unwrap(); + assert_eq!(coit, vec); + } } diff --git a/src/join.rs b/src/join.rs index 84146c4..ed845c4 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,16 +1,13 @@ #![allow(clippy::all)] -use genomap::GenomeMap; +use crate::{traits::GenericRange, Position}; -use crate::{ - granges::GRanges, - iterators::GRangesIterator, - ranges::coitrees::COITreesIndexed, - traits::{GenericRange, IterableRangeContainer}, - Position, -}; - -pub struct JoinData { +/// [`LeftGroupedJoin`] contains information about the right ranges +/// and their degree of overlap with a focal left range. This information +/// is designed to facilitate downstream statistical sumamries of the +/// corresponding data in overlapping ranges. +#[derive(Clone, Debug, PartialEq)] +pub struct LeftGroupedJoin { /// The data index for the left range. left: Option, @@ -30,9 +27,13 @@ pub struct JoinData { // representing the side and degree of non-overlap. E.g. a range // that overlaps another but overhangs the 3' side of the focal left // range by 10bp is +10; if it were 5', it would be -10. + + // left_data: Option<&'a DL>, + // right_data: Option<&'a DR>, } -impl JoinData { +impl LeftGroupedJoin { + /// Create a new [`LeftGroupedJoin`]. pub fn new(left_range: &R) -> Self { Self { left: left_range.index(), @@ -40,69 +41,128 @@ impl JoinData { left_length: left_range.width(), right_lengths: Vec::new(), overlaps: Vec::new(), + // left_data, + // right_data, } } + /// Add a right range to this [`LeftGroupedJoin`]. pub fn add_right(&mut self, left: &R, right: &Q) { self.rights.push(right.index()); self.right_lengths.push(right.width()); self.overlaps.push(left.overlap_width(right)); } + /// Return whether this left range has any [`LeftGroupedJoin`]. + pub fn has_overlaps(&self) -> bool { + !self.overlaps.is_empty() + } + + /// Retrieve the number of right overlaps. + pub fn num_overlaps(&self) -> usize { + self.overlaps.len() + } } -pub struct JoinIterator<'a, CL, DL, DR> -where - CL: IterableRangeContainer, -{ - seqnames: Vec, - left_iter: GRangesIterator<'a, CL>, - right_genomic_ranges: &'a GenomeMap, - left_data: Option<&'a DL>, - right_data: Option<&'a DR>, +/// [`JoinData`] contains a [`Vec`] of all overlap +/// joins, as well as references to the left and right data containers. +#[derive(Clone, Debug)] +pub struct JoinData<'a, DL, DR> { + pub joins: Vec, + pub left_data: Option
, + pub right_data: Option<&'a DR>, } -impl<'a, CL, DL: 'a, DR> JoinIterator<'a, CL, DL, DR> -where - CL: IterableRangeContainer, -{ - pub fn new(left: &'a GRanges, right: &'a GRanges) -> Self { - let seqnames = left.seqnames(); - let left_iter = left.iter_ranges(); - let left_data = left.data.as_ref(); - let right_data = right.data.as_ref(); - let right_ranges = &right.ranges; - Self { - seqnames, - left_iter, - right_genomic_ranges: right_ranges, +impl<'a, DL, DR> JoinData<'a, DL, DR> { + /// Create a new [`JoinData`]. + pub fn new(left_data: Option
, right_data: Option<&'a DR>) -> Self { + let joins = Vec::new(); + JoinData { + joins, left_data, right_data, } } + + /// Push the [`LeftGroupedJoin`] to joins. + pub fn push(&mut self, join: LeftGroupedJoin) { + self.joins.push(join) + } + + /// Get the total number of joins. + pub fn len(&self) -> usize { + self.joins.len() + } + + /// Return whether the [`JoinData`] object is empty (contains no ranges). + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Create an iterator over the joins. + pub fn iter(&'a self) -> JoinDataIterator<'a, DL, DR> { + JoinDataIterator { + inner: self.joins.iter(), + left_data: self.left_data.as_ref(), + right_data: self.right_data, + } + } } -impl<'a, CL, DL, DR> Iterator for JoinIterator<'a, CL, DL, DR> -where - CL: IterableRangeContainer, -{ - type Item = JoinData; +pub struct JoinDataIterator<'a, DL, DR> { + inner: std::slice::Iter<'a, LeftGroupedJoin>, + pub left_data: Option<&'a DL>, + pub right_data: Option<&'a DR>, +} + +impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> { + type Item = &'a LeftGroupedJoin; + fn next(&mut self) -> Option { - if let Some(left_range) = self.left_iter.next() { - let seqname = &self.seqnames[left_range.seqname_index]; - let mut join_data = JoinData::new(&left_range); - if let Some(right_ranges) = self.right_genomic_ranges.get(&seqname) { - right_ranges.query(left_range.start, left_range.end, |interval| { - join_data.add_right(&left_range, interval); - }); - } - Some(join_data) - } else { - None - } + self.inner.next() } } -// -// -// pub fn left_join(left: GRanges, right: GRanges) -> JoinIterator { -// -// } +#[cfg(test)] +mod tests { + use crate::ranges::RangeIndexed; + + use super::{JoinData, LeftGroupedJoin}; + + #[test] + fn test_join_data_new() { + let left_data = vec![1, 2]; + let right_data = vec![4, 8]; + let mut jd = JoinData::new(Some(&left_data), Some(&right_data)); + assert_eq!(jd.len(), 0); + + let left = RangeIndexed::new(0, 10, 1); + let mut join = LeftGroupedJoin::new(&left); + let right = RangeIndexed::new(8, 10, 1); + join.add_right(&left, &right); + jd.push(join); + assert_eq!(jd.len(), 1); + } + + #[test] + fn test_join_iter() { + let left_data = vec![1, 2]; + let right_data = vec![4, 8]; + + let mut jd = JoinData::new(Some(&left_data), Some(&right_data)); + + let left = RangeIndexed::new(0, 10, 1); + let mut join = LeftGroupedJoin::new(&left); + let right = RangeIndexed::new(8, 10, 1); + join.add_right(&left, &right); + jd.push(join); + + let right = RangeIndexed::new(9, 11, 1); + let mut join = LeftGroupedJoin::new(&left); + join.add_right(&left, &right); + jd.push(join); + + let mut iter = jd.iter(); + assert_eq!(iter.next().unwrap().num_overlaps(), 1); + assert_eq!(iter.next().unwrap().num_overlaps(), 1); + assert_eq!(iter.next(), None); + } +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index cd4c78b..f355d53 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -206,6 +206,20 @@ impl GenericRange for IntervalNode { } } +impl PartialEq for COITrees { + fn eq(&self, other: &Self) -> bool { + if self.ranges.len() != other.ranges.len() { + return false; + } + let self_ranges = self.ranges.iter(); + let other_ranges = other.ranges.iter(); + self_ranges.zip(other_ranges).all(|(r1, r2)| { + // NOTE: we really should just implement PartialEq here in coitrees + (r1.first == r2.first) && (r1.last == r2.last) && (r1.metadata == r2.metadata) + }) + } +} + #[cfg(test)] mod tests { use coitrees::{GenericInterval, Interval}; diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 5841a61..42a11a0 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -379,6 +379,11 @@ impl GenomicRangeIndexedRecord { index, } } + /// Using the *corresponding ordered* sequence names the `seqname_index` indices + /// correspond to, get the sequence name. + pub fn seqname(&self, seqnames: &[String]) -> String { + seqnames[self.seqname_index].clone() + } pub fn to_record<'a, T>( self, seqnames: &[String], @@ -490,6 +495,12 @@ mod tests { assert_eq!(range_a.overlap_range(&range_b), Some((5, 6))); } + #[test] + fn test_width() { + let range_a = RangeEmpty::new(5, 8); + assert_eq!(range_a.width(), 3); + } + #[test] fn test_overlap_width() { let range_a = RangeEmpty::new(0, 2); diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 1cbcc40..7213b0f 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -9,7 +9,7 @@ use crate::{error::GRangesError, traits::RangeContainer, Position}; pub type VecRangesIndexed = VecRanges; pub type VecRangesEmpty = VecRanges; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct VecRanges { pub(crate) ranges: Vec, pub length: Position, diff --git a/src/traits.rs b/src/traits.rs index bdf2470..812bbf4 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -29,21 +29,13 @@ pub trait GenomicRangesTsvSerialize<'a, C: RangeContainer> { fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError>; } -///// Traits for [`GRanges`] types that can be built from an iterator. -//pub trait GenomicRangesOperationsExtended { -// type DataContainerType; -// type DataElementType; -// /// Build a new [`GRanges`] object from an iterator. -// fn from_iter(iter: I, seqlens: &IndexMap) -> Result, GRangesError> where I: Iterator, GRangesError>>; -//} - /// The [`GenericRange`] trait defines common functionality for all range types. pub trait GenericRange: Clone { fn start(&self) -> Position; fn end(&self) -> Position; fn index(&self) -> Option; fn width(&self) -> Position { - self.end() - self.start() - 1 + self.end() - self.start() } /// Calculate how many basepairs overlap this range and other. fn overlap_width(&self, other: &R) -> Position { @@ -66,6 +58,11 @@ pub trait GenericRange: Clone { None } } + + /// Return a tuple version of this range. + fn as_tuple(&self) -> (Position, Position, Option) { + (self.start(), self.end(), self.index()) + } } /// The [`GenericRangeOperations`] trait extends additional functionality to [`GenericRange`],