From 8de9b47cd2503567d49f8156cb6ae07aab2177d7 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Wed, 14 Feb 2024 17:13:46 -0800 Subject: [PATCH] initial import --- .gitignore | 1 + Cargo.toml | 13 ++++++ src/error.rs | 13 ++++++ src/granges.rs | 96 +++++++++++++++++++++++++++++++++++++ src/join.rs | 2 + src/lib.rs | 13 ++++++ src/ranges/coitrees.rs | 74 +++++++++++++++++++++++++++++ src/ranges/mod.rs | 104 +++++++++++++++++++++++++++++++++++++++++ src/ranges/vec.rs | 47 +++++++++++++++++++ src/test_utilities.rs | 61 ++++++++++++++++++++++++ src/traits.rs | 7 +++ 11 files changed, 431 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 src/error.rs create mode 100644 src/granges.rs create mode 100644 src/join.rs create mode 100644 src/lib.rs create mode 100644 src/ranges/coitrees.rs create mode 100644 src/ranges/mod.rs create mode 100644 src/ranges/vec.rs create mode 100644 src/test_utilities.rs create mode 100644 src/traits.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d69a2d1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "granges2" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +coitrees = { version = "0.4.0", features = ["nosimd"] } +genomap = "0.1.5" +indexmap = "2.2.3" +rand = "0.8.5" +thiserror = "1.0.57" diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..9f636f1 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,13 @@ +use thiserror::Error; + +use crate::Position; + +#[derive(Debug, Error)] +pub enum GRangesError { + #[error("Range invalid: start ({0}) must be greater than end ({1})")] + InvalidGenomicRange(Position, Position), + + #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] + InvalidGenomicRangeForSequence(Position, Position, Position), + +} diff --git a/src/granges.rs b/src/granges.rs new file mode 100644 index 0000000..13c9365 --- /dev/null +++ b/src/granges.rs @@ -0,0 +1,96 @@ +use genomap::GenomeMap; + +use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position}; + + +pub struct GRanges { + ranges: GenomeMap, + data: Option, +} + + +impl GRanges +where C: RangeContainer { + + /// Get the total number of ranges. + pub fn len(&self) -> usize { + self.ranges.values().map(|ranges| ranges.len()).sum() + } + + /// Return whether the [`GRanges`] object is empty (contains no ranges). + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl GRanges> { + + /// Create a new [`GRanges`] object, with vector storage for ranges and data. + /// + /// This combination of range and data containers is used when loading data into + /// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's + /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] + /// call the [`Vec`]'s shrink to size methods on the range and data containers + /// after data loading to shrink to the minimal necessary size (this can reduce + /// memory usage). + pub fn new_vec() -> Self { + let ranges = GenomeMap::new(); + Self { + ranges, + data: None, + } + } + + + pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) { + // push data to the vec data container, getting the index + let index: usize = { + let data_container = self.data.get_or_insert_with(Vec::new); + data_container.push(data); + data_container.len() - 1 // new data index + }; + // push an indexed range + let range = RangeIndexed::new(start, end, index); + self.ranges.entry_or_default(seqname).ranges.push(range); + } +} + +impl GRanges { + + /// Create a new [`GRanges`] object, with vector storage for ranges and no data container. + pub fn new_vec_empty() -> Self { + let ranges = GenomeMap::new(); + Self { + ranges, + data: None, + } + } + + /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. + pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) { + // push an unindexed (empty) range + let range = RangeEmpty::new(start, end); + self.ranges.entry_or_default(seqname).ranges.push(range); + } +} + + + +#[cfg(test)] +mod tests { + use crate::{prelude::*, test_utilities::random_vecranges}; + + #[test] + fn test_new_vec() { + let mut gr = GRanges::new_vec(); + gr.push_range_with_data("chr1", 0, 10, 1.1); + assert_eq!(gr.len(), 1); + } + + #[test] + fn test_random_vecranges() { + let vr = random_vecranges(100); + assert_eq!(vr.len(), 100) + } + +} diff --git a/src/join.rs b/src/join.rs new file mode 100644 index 0000000..139597f --- /dev/null +++ b/src/join.rs @@ -0,0 +1,2 @@ + + diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..cb1be7a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,13 @@ + +pub mod traits; +pub mod ranges; +pub mod granges; +pub mod error; +pub mod test_utilities; + +pub type Position = u32; + +pub mod prelude { + pub use crate::granges::GRanges; + pub use crate::error::GRangesError; +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs new file mode 100644 index 0000000..2e9d041 --- /dev/null +++ b/src/ranges/coitrees.rs @@ -0,0 +1,74 @@ +use coitrees::{Interval, BasicCOITree, IntervalTree, IntervalNode, GenericInterval}; + +use crate::{Position, traits::RangeContainer, error::GRangesError}; + +use super::{vec::VecRanges, RangeIndexed, validate_range}; + +type COITreeIntervalIndexed = Interval; + +impl GenericInterval for RangeIndexed { + fn first(&self) -> i32 { + self.start().try_into().unwrap() + } + fn last(&self) -> i32 { + self.end().try_into().unwrap() + } + fn metadata(&self) -> &usize { + self.index() + } +} + +/// A [`coitrees::BasicCOITree`] interval tree for a single sequence's ranges. +/// +/// This is generic over the interval type, to handle the case where one +/// may want to do overlap operations on ranges without associated data in +/// a data container (e.g. ranges that just define megabase windwows). +pub struct COITreeRangeContainer { + ranges: BasicCOITree, + /// The sequence length, used to validate new ranges. + length: Position, +} + +impl COITreeRangeContainer { + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + let range = start..end; + validate_range(&range, self.length) + } + + pub fn query(&self, start: Position, end: Position, visit: F) + where F: FnMut(&IntervalNode) { + // Note the terminology change to match coitrees (and uses i32s) + let first = start.try_into().expect("could not covert"); + let end: i32 = end.try_into().expect("could not covert"); + // internally coitrees uses 0-indexed, right-inclusive "last" + self.ranges.query(first, end - 1, visit) + } + + /// Return the number of ranges in this [`COITreeRangeContainer`] container. + pub fn len(&self) -> usize { + self.ranges.len() + } + + /// Return whether the [`COITreeRangeContainer`] object is empty (contains no ranges). + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + +} + +impl> From> for COITreeRangeContainer { + fn from(value: VecRanges) -> Self { + let ranges = BasicCOITree::new(&value.ranges); + let length = value.length; + Self { + ranges, + length + } + } +} + +impl RangeContainer for COITreeRangeContainer { + fn len(&self) -> usize { + self.ranges.len() + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs new file mode 100644 index 0000000..1e37bcf --- /dev/null +++ b/src/ranges/mod.rs @@ -0,0 +1,104 @@ +use std::ops::Range; + +use crate::{Position, error::GRangesError}; + +pub mod coitrees; +pub mod vec; + + +#[derive(Clone, Default)] +pub struct RangeEmpty { + range: Range, +} + +impl RangeEmpty { + /// Create a new 0-indexed right-exclusive range. + pub fn new(start: Position, end: Position) -> Self { + let start = start.try_into().unwrap(); + let end = end.try_into().unwrap(); + Self { + range: start..end, + } + } + + pub fn start(&self) -> Position { + self.range.start + } + + pub fn end(&self) -> Position { + self.range.end + } +} + +#[derive(Clone, Debug, Default)] +pub struct RangeIndexed { + range: Range, + index: usize, +} + +impl RangeIndexed { + /// Create a new 0-indexed right-exclusive range. + pub fn new(start: Position, end: Position, index: usize) -> Self { + let start = start.try_into().unwrap(); + let end = end.try_into().unwrap(); + Self { + range: start..end, + index + } + } + + pub fn start(&self) -> Position { + self.range.start + } + + pub fn end(&self) -> Position { + self.range.end + } + + // Note: this returning a reference is required to + // implement coitrees's GenericInterval trait. + pub fn index(&self) -> &usize { + &self.index + } +} + +/// Validates whether a given range is valid for accessing a sequence of a given `length`. +/// +/// # Arguments +/// +/// * `range` - The range to validate. +/// * `length` - The length of the sequence. +/// +/// # Returns +/// +/// * `bool` - `true` if the range is valid for the sequence; otherwise, `false`. +pub fn validate_range(range: &std::ops::Range, length: Position) -> +Result<(), GRangesError> { + let start = range.start; + let end = range.start; + dbg!(&start); + dbg!(&end); + if start > end { + GRangesError::InvalidGenomicRange(start, end); + } + + if end >= length { + GRangesError::InvalidGenomicRangeForSequence(start, end, length); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + use super::validate_range; + + #[test] + fn test_invalid_range_start_end() { + let range = 10..1; + let result = validate_range(&range, 10); + dbg!(&range); + assert!(matches!(result, Err(GRangesError::InvalidGenomicRange(10, 0)))); + + } +} diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs new file mode 100644 index 0000000..efc8026 --- /dev/null +++ b/src/ranges/vec.rs @@ -0,0 +1,47 @@ +use crate::{traits::RangeContainer, Position, error::GRangesError}; + +use super::{RangeIndexed, validate_range, RangeEmpty}; +pub type VecRangesIndexed = VecRanges; +pub type VecRangesEmpty = VecRanges; + +#[derive(Clone, Default)] +pub struct VecRanges { + pub (crate) ranges: Vec, + pub length: Position, +} + +impl VecRanges { + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + let range = start..end; + validate_range(&range, self.length) + } + + /// Create a new empty [`VecRanges`] container. + pub fn new(length: Position) -> Self { + Self { + ranges: Vec::new(), + length, + } + } + + /// Add a new range to the [`VecRanges`] container. + pub fn push_range(&mut self, range: R) { + self.ranges.push(range) + } + + /// Return the number of ranges in this [`VecRanges`] container. + pub fn len(&self) -> usize { + self.ranges.len() + } + + /// Return whether the [`VecRanges`] object is empty (contains no ranges). + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl RangeContainer for VecRanges { + fn len(&self) -> usize { + self.ranges.len() + } +} diff --git a/src/test_utilities.rs b/src/test_utilities.rs new file mode 100644 index 0000000..856e5c2 --- /dev/null +++ b/src/test_utilities.rs @@ -0,0 +1,61 @@ +//! Test cases and test utility functions. +//! + +use indexmap::IndexMap; +use rand::{thread_rng, Rng}; +use crate::{Position, ranges::{RangeIndexed, vec::VecRanges, RangeEmpty}}; + +// Stochastic test ranges defaults +// +// This is the random number of range to use in tests. +// The tradeoff is catching stochastic errors vs test time. +pub const NRANDOM_RANGES: usize = 10000; + +// range length +pub const MIN_LEN: Position = 1; +pub const MAX_LEN: Position = 10000; + +// number of chromosome sequences +pub const NCHROM: usize = 22; + +// chromosome sizes +pub const MIN_CHROM_LEN: Position = 50_000_000; +pub const MAX_CHROM_LEN: Position = 250_000_000; + +/// Build a random range start/end on a sequence of `max_len`. +/// 0-indexed, right exclusive +pub fn random_range(chrom_len: Position) -> (Position, Position) { + let mut rng = thread_rng(); + let len = rng.gen_range(MIN_LEN..MAX_LEN); + let start = rng.gen_range(0..chrom_len - len + 1); + (start, start + len) +} + +/// Build random sequence lengths +pub fn random_seqlen() -> Position { + let mut rng = thread_rng(); + let rand_len = rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN); + rand_len +} + +/// Sample a random chromosome +pub fn random_chrom() -> String { + let mut rng = thread_rng(); + format!("chr{}", rng.gen_range(1..NCHROM + 1)) +} + +/// Build random [`VecRanges`] +pub fn random_vecranges(n: usize) -> VecRanges { + let seqlen = random_seqlen(); + let mut vr = VecRanges::new(seqlen); + for _i in 0..n { + let (start, end) = random_range(seqlen); + let range = RangeEmpty::new(start, end); + vr.push_range(range); + } + vr +} + + + + diff --git a/src/traits.rs b/src/traits.rs new file mode 100644 index 0000000..f473fb0 --- /dev/null +++ b/src/traits.rs @@ -0,0 +1,7 @@ + +pub trait RangeContainer { + fn len(&self) -> usize; + fn is_empty(&self) -> bool { + self.len() == 0 + } +}