From 1b5e236d912721d854f2eeb90dcafeb2e7f01321 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Wed, 14 Feb 2024 21:16:12 -0800 Subject: [PATCH] Basic iterator trait, range pushing methods, to coitrees, and more! - tests and test utilies - rough join stuff - interval/range conversion methods - clippy & fmt - GitHub Rust workflow added --- .github/workflows/rust.yml | 22 +++++++ Cargo.toml | 4 +- src/error.rs | 6 ++ src/granges.rs | 117 +++++++++++++++++++++++++-------- src/iterators.rs | 11 ++++ src/join.rs | 43 ++++++++++++ src/lib.rs | 37 +++++++++-- src/ranges/coitrees.rs | 131 ++++++++++++++++++++++++++++++------- src/ranges/mod.rs | 98 +++++++++++++-------------- src/ranges/vec.rs | 27 +++++--- src/test_utilities.rs | 39 +++++++++-- src/traits.rs | 1 - 12 files changed, 409 insertions(+), 127 deletions(-) create mode 100644 .github/workflows/rust.yml create mode 100644 src/iterators.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..31000a2 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/Cargo.toml b/Cargo.toml index d69a2d1..a4cd622 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "granges2" +name = "granges" version = "0.1.0" edition = "2021" @@ -7,7 +7,7 @@ edition = "2021" [dependencies] coitrees = { version = "0.4.0", features = ["nosimd"] } -genomap = "0.1.5" +genomap = "0.2.6" indexmap = "2.2.3" rand = "0.8.5" thiserror = "1.0.57" diff --git a/src/error.rs b/src/error.rs index 9f636f1..f66c007 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,3 +1,4 @@ +use genomap::GenomeMapError; use thiserror::Error; use crate::Position; @@ -10,4 +11,9 @@ pub enum GRangesError { #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] InvalidGenomicRangeForSequence(Position, Position, Position), + #[error("Sequence name '{0}' is not the ranges container")] + MissingSequence(String), + + #[error("Error encountered in genomap::GenomeMap")] + GenomeMapError(#[from] GenomeMapError), } diff --git a/src/granges.rs b/src/granges.rs index 13c9365..f57b655 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,17 +1,27 @@ use genomap::GenomeMap; - -use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position}; - - +use indexmap::IndexMap; + +use crate::{ + prelude::GRangesError, + ranges::{ + coitrees::{COITrees, COITreesIndexed}, + vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, + RangeEmpty, RangeIndexed, + }, + traits::RangeContainer, + Position, +}; + +#[derive(Clone)] pub struct GRanges { ranges: GenomeMap, data: Option, } - impl GRanges -where C: RangeContainer { - +where + C: RangeContainer, +{ /// Get the total number of ranges. pub fn len(&self) -> usize { self.ranges.values().map(|ranges| ranges.len()).sum() @@ -24,25 +34,35 @@ where C: RangeContainer { } impl GRanges> { - /// Create a new [`GRanges`] object, with vector storage for ranges and data. /// /// This combination of range and data containers is used when loading data into - /// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's - /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] + /// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's + /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] /// call the [`Vec`]'s shrink to size methods on the range and data containers /// after data loading to shrink to the minimal necessary size (this can reduce /// memory usage). - pub fn new_vec() -> Self { - let ranges = GenomeMap::new(); - Self { - ranges, - data: None, + pub fn new_vec(seqlens: &IndexMap) -> Self { + let mut ranges = GenomeMap::new(); + for (seqname, length) in seqlens.iter() { + // this should never happen because the error is only if + // insert encounters a seqname that's already been inserted -- that + // cannot happen here. + ranges + .insert(seqname, VecRanges::new(*length)) + .expect("Internal error: please report"); } + Self { ranges, data: None } } - - pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) { + /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. + pub fn push_range_with_data( + &mut self, + seqname: &str, + start: Position, + end: Position, + data: U, + ) -> Result<(), GRangesError> { // push data to the vec data container, getting the index let index: usize = { let data_container = self.data.get_or_insert_with(Vec::new); @@ -51,39 +71,72 @@ impl GRanges> { }; // push an indexed range let range = RangeIndexed::new(start, end, index); - self.ranges.entry_or_default(seqname).ranges.push(range); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) } } impl GRanges { - /// Create a new [`GRanges`] object, with vector storage for ranges and no data container. pub fn new_vec_empty() -> Self { let ranges = GenomeMap::new(); - Self { - ranges, - data: None, - } + Self { ranges, data: None } } /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. - pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) { + pub fn push_range_only( + &mut self, + seqname: &str, + start: Position, + end: Position, + ) -> Result<(), GRangesError> { // push an unindexed (empty) range let range = RangeEmpty::new(start, end); - self.ranges.entry_or_default(seqname).ranges.push(range); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) } } - +impl GRanges { + /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree + /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library + /// by Daniel C. Jones. + pub fn to_coitrees(self) -> Result, GRangesError> { + let old_ranges = self.ranges; + let mut new_ranges = GenomeMap::new(); + for (seqname, vec_ranges) in old_ranges.into_iter() { + let trees = COITrees::from(vec_ranges); + new_ranges.insert(&seqname, trees)?; + } + Ok(GRanges { + ranges: new_ranges, + data: self.data, + }) + } +} #[cfg(test)] mod tests { - use crate::{prelude::*, test_utilities::random_vecranges}; + use indexmap::indexmap; + + use crate::{ + prelude::*, + test_utilities::{granges_test_case_01, random_vecranges}, + }; #[test] fn test_new_vec() { - let mut gr = GRanges::new_vec(); - gr.push_range_with_data("chr1", 0, 10, 1.1); + let seqlens = indexmap! { "chr1".to_string() => 10}; + let mut gr = GRanges::new_vec(&seqlens); + gr.push_range_with_data("chr1", 0, 10, 1.1).unwrap(); assert_eq!(gr.len(), 1); } @@ -93,4 +146,10 @@ mod tests { assert_eq!(vr.len(), 100) } + #[test] + fn test_to_coitrees() { + let gr_vec = granges_test_case_01(); + let gr = gr_vec.clone().to_coitrees().unwrap(); + assert_eq!(gr.len(), 5); + } } diff --git a/src/iterators.rs b/src/iterators.rs new file mode 100644 index 0000000..58507fe --- /dev/null +++ b/src/iterators.rs @@ -0,0 +1,11 @@ +/// the [`rangesiterable`] trait defines common functionality for iterating over +/// the copyable range types. +pub trait RangesIterable { + fn iter_ranges(&self) -> Box + '_>; +} + +pub trait RangesIntoIterable { + fn into_iter_ranges(self) -> Box>; +} + + diff --git a/src/join.rs b/src/join.rs index 139597f..a381a51 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,2 +1,45 @@ +use std::rc::Rc; +use crate::Position; +pub struct JoinData { + /// The data index for the left range. + left: usize, + + /// A `Vec` of the indices for the overlapping right ranges. + rights: Vec, + + /// The length of the left range. + left_length: Position, + + /// The lengths of the right ranges. + right_lengths: Vec, + + /// The lengths of the overlaps between the left and right ranges. + overlaps: Vec, + + // TODO: we may want some simple summary of whether something is + // up or downstream. I think the cleanest summary is a signed integer + // representing the side and degree of non-overlap. E.g. a range + // that overlaps another but overhangs the 3' side of the focal left + // range by 10bp is +10; if it were 5', it would be -10. + /// A possible reference to the left data of type `T`. + left_data: Option>, + + /// A possible reference to the right data of type `T`. + right_data: Option>, +} + +pub struct JoinIterator { + left_data: Option>, + right_data: Option>, +} +// +// impl JoinIterator { +// +// } +// +// +// pub fn left_join(left: GRanges, right: GRanges) -> JoinIterator { +// +// } diff --git a/src/lib.rs b/src/lib.rs index cb1be7a..60177cb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,42 @@ +// Copyright (2024) Vince Buffalo +#![crate_name = "granges"] +#![doc(html_root_url = "https://docs.rs/granges/")] -pub mod traits; -pub mod ranges; -pub mod granges; pub mod error; +pub mod granges; +pub mod iterators; +pub mod join; +pub mod ranges; pub mod test_utilities; +pub mod traits; pub type Position = u32; pub mod prelude { - pub use crate::granges::GRanges; pub use crate::error::GRangesError; + pub use crate::granges::GRanges; + pub use crate::iterators::RangesIterable; + + pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; +} + +/// Create a new `GRanges` with sequence length information (used primarily for small examples) +#[macro_export] +macro_rules! create_granges_with_seqlens { + ($range_type:ty, $data_type:ty, { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),*]),* }, seqlens: { $($chr_len:expr => $len:expr),* }) => { + { + let mut seqlens = ::indexmap::IndexMap::new(); + $(seqlens.insert($chr_len.to_string(), $len);)* + + let mut gr: GRanges<$range_type, $data_type> = GRanges::new_vec(&seqlens); + + $( + $( + gr.push_range_with_data(&$chr.to_string(), $start, $end, $data).expect("Failed to push range"); + )* + )* + + gr + } + }; } diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 2e9d041..c9311f4 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,45 +1,62 @@ -use coitrees::{Interval, BasicCOITree, IntervalTree, IntervalNode, GenericInterval}; +use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; -use crate::{Position, traits::RangeContainer, error::GRangesError}; +use crate::{error::GRangesError, iterators::{RangesIterable, RangesIntoIterable}, traits::RangeContainer, Position}; -use super::{vec::VecRanges, RangeIndexed, validate_range}; +use super::{validate_range, vec::VecRanges, RangeEmpty, RangeIndexed}; -type COITreeIntervalIndexed = Interval; +pub type COITreesIndexed = COITrees; + +impl GenericInterval<()> for RangeEmpty { + fn first(&self) -> i32 { + self.start.try_into().unwrap() + } + fn last(&self) -> i32 { + self.end.try_into().unwrap() + } + fn metadata(&self) -> &() { + &() + } +} impl GenericInterval for RangeIndexed { fn first(&self) -> i32 { - self.start().try_into().unwrap() + self.start.try_into().unwrap() } fn last(&self) -> i32 { - self.end().try_into().unwrap() + self.end.try_into().unwrap() } fn metadata(&self) -> &usize { - self.index() + &self.index } } /// A [`coitrees::BasicCOITree`] interval tree for a single sequence's ranges. /// -/// This is generic over the interval type, to handle the case where one -/// may want to do overlap operations on ranges without associated data in +/// This is generic over the interval type, to handle the case where one +/// may want to do overlap operations on ranges without associated data in /// a data container (e.g. ranges that just define megabase windwows). -pub struct COITreeRangeContainer { - ranges: BasicCOITree, +/// +pub struct COITrees { + pub(crate) ranges: BasicCOITree, /// The sequence length, used to validate new ranges. - length: Position, + pub length: Position, } -impl COITreeRangeContainer { +impl COITrees { + /// Validate a range, raising an error if it is invalid for some reason. pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) + validate_range(start, end, self.length) } - pub fn query(&self, start: Position, end: Position, visit: F) - where F: FnMut(&IntervalNode) { + /// Query this range container for a particular range, and call a visit function on all + /// overlapping ranges. + pub fn query(&self, start: Position, end: Position, visit: F) + where + F: FnMut(&IntervalNode), + { // Note the terminology change to match coitrees (and uses i32s) let first = start.try_into().expect("could not covert"); - let end: i32 = end.try_into().expect("could not covert"); + let end: i32 = end.try_into().expect("could not covert"); // internally coitrees uses 0-indexed, right-inclusive "last" self.ranges.query(first, end - 1, visit) } @@ -53,22 +70,88 @@ impl COITreeRangeContainer { pub fn is_empty(&self) -> bool { self.len() == 0 } - } -impl> From> for COITreeRangeContainer { +/// Convert a [`VecRanges`] range container to a [`COITrees`] range container. +impl, M: Clone> From> for COITrees { fn from(value: VecRanges) -> Self { let ranges = BasicCOITree::new(&value.ranges); let length = value.length; - Self { - ranges, - length + Self { ranges, length } + } +} + +/// Convert a [`COITrees`] range container to a [`VecRanges`] range container. +impl From> for VecRanges { + fn from(value: COITrees) -> Self { + let length = value.length; + let mut ranges: VecRanges = VecRanges::new(length); + for interval in value.ranges.iter() { + ranges.push_range(interval.into()); } + ranges } } -impl RangeContainer for COITreeRangeContainer { +/// [`RangeContainer`] trait implementations. +impl RangeContainer for COITrees { fn len(&self) -> usize { self.ranges.len() } } + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeEmpty`]. +impl From> for RangeEmpty { + fn from(value: Interval<&()>) -> Self { + RangeEmpty { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + } + } +} + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeIndexed`]. +impl From> for RangeIndexed { + fn from(value: Interval<&usize>) -> Self { + RangeIndexed { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + index: *value.metadata(), + } + } +} + + +/// +/// # Developer Notes +/// +/// Internally, the [`coitrees`] iterator is over their interval type. +/// Their iterator does not consume, but we need an owned (or copyable, +/// as is case here) type to map out. Thus, we need this odd variant of +/// an iterator that doesn't return references and does not consume. +impl RangesIterable for COITrees { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeIndexed::from(interval)); + Box::new(converted_iter) + } +} + +impl RangesIterable for COITrees<()> { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeEmpty::from(interval)); + Box::new(converted_iter) + } +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + use crate::test_utilities::{granges_test_case_01, random_coitrees}; + + #[test] + fn test_ranges_iterable_coitrees() { + let gr = granges_test_case_01(); + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 1e37bcf..f527f40 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -1,64 +1,42 @@ -use std::ops::Range; +//! Range and Range Containers. +//! +//! -use crate::{Position, error::GRangesError}; +use crate::{error::GRangesError, Position}; pub mod coitrees; pub mod vec; - #[derive(Clone, Default)] pub struct RangeEmpty { - range: Range, + pub start: Position, + pub end: Position, } +unsafe impl Sync for RangeEmpty {} +unsafe impl Send for RangeEmpty {} + impl RangeEmpty { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - } - } - - pub fn start(&self) -> Position { - self.range.start - } - - pub fn end(&self) -> Position { - self.range.end + Self { start, end } } } #[derive(Clone, Debug, Default)] pub struct RangeIndexed { - range: Range, - index: usize, + pub start: Position, + pub end: Position, + pub index: usize, } +unsafe impl Sync for RangeIndexed {} +unsafe impl Send for RangeIndexed {} + impl RangeIndexed { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position, index: usize) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - index - } - } - - pub fn start(&self) -> Position { - self.range.start - } - - pub fn end(&self) -> Position { - self.range.end - } - - // Note: this returning a reference is required to - // implement coitrees's GenericInterval trait. - pub fn index(&self) -> &usize { - &self.index + Self { start, end, index } } } @@ -72,33 +50,49 @@ impl RangeIndexed { /// # Returns /// /// * `bool` - `true` if the range is valid for the sequence; otherwise, `false`. -pub fn validate_range(range: &std::ops::Range, length: Position) -> -Result<(), GRangesError> { - let start = range.start; - let end = range.start; - dbg!(&start); - dbg!(&end); +pub fn validate_range( + start: Position, + end: Position, + length: Position, +) -> Result<(), GRangesError> { if start > end { - GRangesError::InvalidGenomicRange(start, end); + return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { - GRangesError::InvalidGenomicRangeForSequence(start, end, length); + return Err(GRangesError::InvalidGenomicRangeForSequence( + start, end, length, + )); } Ok(()) } #[cfg(test)] mod tests { - use crate::prelude::*; use super::validate_range; + use crate::prelude::*; #[test] fn test_invalid_range_start_end() { - let range = 10..1; - let result = validate_range(&range, 10); - dbg!(&range); - assert!(matches!(result, Err(GRangesError::InvalidGenomicRange(10, 0)))); + let result = validate_range(5, 1, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); + } + #[test] + fn test_valid_range_length() { + let result = validate_range(1, 10, 11); + assert!(result.is_ok()); + } + + #[test] + fn test_invalid_range_length() { + let result = validate_range(1, 10, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } } diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index efc8026..e31b082 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,21 +1,17 @@ -use crate::{traits::RangeContainer, Position, error::GRangesError}; +use crate::{error::GRangesError, traits::RangeContainer, Position}; +use crate::iterators::{RangesIterable, RangesIntoIterable}; +use super::{validate_range, RangeEmpty, RangeIndexed}; -use super::{RangeIndexed, validate_range, RangeEmpty}; pub type VecRangesIndexed = VecRanges; pub type VecRangesEmpty = VecRanges; -#[derive(Clone, Default)] +#[derive(Clone)] pub struct VecRanges { - pub (crate) ranges: Vec, + pub(crate) ranges: Vec, pub length: Position, } impl VecRanges { - pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) - } - /// Create a new empty [`VecRanges`] container. pub fn new(length: Position) -> Self { Self { @@ -24,6 +20,11 @@ impl VecRanges { } } + /// Validate a range, raising an error if it is invalid for some reason. + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + validate_range(start, end, self.length) + } + /// Add a new range to the [`VecRanges`] container. pub fn push_range(&mut self, range: R) { self.ranges.push(range) @@ -45,3 +46,11 @@ impl RangeContainer for VecRanges { self.ranges.len() } } + + +impl RangesIntoIterable for VecRanges { + fn into_iter_ranges(self) -> Box> { + let iter = self.ranges.into_iter(); + Box::new(iter) + } +} diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 856e5c2..4b330c0 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,9 +1,13 @@ //! Test cases and test utility functions. //! -use indexmap::IndexMap; +use crate::{ + create_granges_with_seqlens, + prelude::{GRanges, VecRangesIndexed}, + ranges::{coitrees::COITrees, vec::VecRanges, RangeEmpty}, + Position, +}; use rand::{thread_rng, Rng}; -use crate::{Position, ranges::{RangeIndexed, vec::VecRanges, RangeEmpty}}; // Stochastic test ranges defaults // @@ -34,8 +38,7 @@ pub fn random_range(chrom_len: Position) -> (Position, Position) { /// Build random sequence lengths pub fn random_seqlen() -> Position { let mut rng = thread_rng(); - let rand_len = rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN); - rand_len + rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome @@ -56,6 +59,30 @@ pub fn random_vecranges(n: usize) -> VecRanges { vr } +/// Build random [`COITrees`] from a random [`VecRanges`]. +pub fn random_coitrees() -> COITrees<()> { + let vr = random_vecranges(100); + let cr: COITrees<()> = vr.into(); + cr +} - - +/// Range test case #1 +/// +/// Ranges: +/// - chr1: +/// (0, 5, Some(1.1)) +/// (4, 7, Some(8.1)) +/// (10, 17, Some(10.1)) +/// - chr2: +/// (10, 20, Some(3.7)) +/// (18, 32, Some(1.1)) +/// +/// Seqlens: { "chr1" => 30, "chr2" => 100 } +/// +/// Sum of all elements: 24.1 +pub fn granges_test_case_01() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(0, 5, 1.1), (4, 7, 8.1), (10, 17, 10.1)], + "chr2" => [(10, 20, 3.7), (18, 32, 1.1)] + }, seqlens: { "chr1" => 30, "chr2" => 100 }) +} diff --git a/src/traits.rs b/src/traits.rs index f473fb0..53087bd 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,4 +1,3 @@ - pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool {