Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
vsbuffalo committed Feb 15, 2024
0 parents commit 8de9b47
Show file tree
Hide file tree
Showing 11 changed files with 431 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target
13 changes: 13 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "granges2"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
coitrees = { version = "0.4.0", features = ["nosimd"] }
genomap = "0.1.5"
indexmap = "2.2.3"
rand = "0.8.5"
thiserror = "1.0.57"
13 changes: 13 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
use thiserror::Error;

use crate::Position;

#[derive(Debug, Error)]
pub enum GRangesError {
#[error("Range invalid: start ({0}) must be greater than end ({1})")]
InvalidGenomicRange(Position, Position),

#[error("Range [{0}, {1}] is invalid for sequence of length {2}")]
InvalidGenomicRangeForSequence(Position, Position, Position),

}
96 changes: 96 additions & 0 deletions src/granges.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use genomap::GenomeMap;

use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position};


/// A collection of ranges stored per sequence, plus an optional data container.
///
/// `C` is the range container type held in the [`GenomeMap`]; `T` is the type
/// of the data container.
pub struct GRanges<C, T> {
/// The range containers; entries are looked up by sequence name when ranges are pushed.
ranges: GenomeMap<C>,
/// The data container. The constructors in this file initialize it to `None`.
data: Option<T>,
}


impl<C, T> GRanges<C, T>
where
    C: RangeContainer,
{
    /// Count the ranges held across every range container in this object.
    pub fn len(&self) -> usize {
        self.ranges
            .values()
            .fold(0, |total, container| total + container.len())
    }

    /// Report whether this [`GRanges`] object holds no ranges at all.
    pub fn is_empty(&self) -> bool {
        let n_ranges = self.len();
        n_ranges == 0
    }
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {
    /// Build an empty [`GRanges`] whose ranges and data are both kept in vectors.
    ///
    /// This pairing of containers suits loading data of unknown size: a
    /// [`Vec`] grows dynamically as new ranges and data elements are pushed.
    /// The data container starts out as `None` and is created on the first push.
    pub fn new_vec() -> Self {
        Self {
            ranges: GenomeMap::new(),
            data: None,
        }
    }

    /// Append a range on sequence `seqname` together with its `data` element.
    ///
    /// The data element is pushed onto the data vector (created here if it
    /// does not exist yet), and the stored range records the position of that
    /// element in the vector.
    pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) {
        let container = self.data.get_or_insert_with(Vec::new);
        // The element about to be pushed lands at the current length.
        let index = container.len();
        container.push(data);

        // Tie the range to its data element through that index.
        let indexed = RangeIndexed::new(start, end, index);
        self.ranges.entry_or_default(seqname).push_range(indexed);
    }
}

impl GRanges<VecRangesEmpty, ()> {
    /// Build an empty [`GRanges`] with vector-backed ranges and no data container.
    pub fn new_vec_empty() -> Self {
        Self {
            ranges: GenomeMap::new(),
            data: None,
        }
    }

    /// Append a data-less range on sequence `seqname` to its [`VecRangesEmpty`] container.
    pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) {
        // No data element exists, so the range carries no index.
        let bare = RangeEmpty::new(start, end);
        self.ranges.entry_or_default(seqname).push_range(bare);
    }
}



#[cfg(test)]
mod tests {
    use crate::{prelude::*, test_utilities::random_vecranges};

    /// A fresh vector-backed `GRanges` reports one range after a single push.
    #[test]
    fn test_new_vec() {
        let mut granges = GRanges::new_vec();
        granges.push_range_with_data("chr1", 0, 10, 1.1);
        let n_ranges = granges.len();
        assert_eq!(n_ranges, 1);
    }

    /// The random range generator yields as many ranges as requested.
    #[test]
    fn test_random_vecranges() {
        let random_ranges = random_vecranges(100);
        assert_eq!(random_ranges.len(), 100);
    }
}
2 changes: 2 additions & 0 deletions src/join.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@


13 changes: 13 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

/// Traits shared across the crate (e.g. `RangeContainer`).
pub mod traits;
/// Range types and the containers that store them.
pub mod ranges;
/// The `GRanges` type.
pub mod granges;
/// The crate's error type, `GRangesError`.
pub mod error;
/// Helpers used by the crate's tests (e.g. `random_vecranges`).
pub mod test_utilities;

/// The integer type used for range start/end positions and sequence lengths.
pub type Position = u32;

/// Convenience re-exports of the crate's main types.
pub mod prelude {
pub use crate::granges::GRanges;
pub use crate::error::GRangesError;
}
74 changes: 74 additions & 0 deletions src/ranges/coitrees.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use coitrees::{Interval, BasicCOITree, IntervalTree, IntervalNode, GenericInterval};

use crate::{Position, traits::RangeContainer, error::GRangesError};

use super::{vec::VecRanges, RangeIndexed, validate_range};

/// A coitrees [`Interval`] whose metadata is a `usize`
/// (presumably an index into a data container; confirm against callers).
type COITreeIntervalIndexed = Interval<usize>;

/// Lets a [`RangeIndexed`] be used directly as a coitrees interval, with the
/// range's index as the interval metadata.
///
/// [`RangeIndexed`] is 0-indexed and right-exclusive, whereas coitrees
/// intervals are 0-indexed and right-inclusive (`first..=last`), so the end
/// position is shifted down by one in [`GenericInterval::last`].
impl GenericInterval<usize> for RangeIndexed {
    /// The inclusive first position of the interval.
    ///
    /// # Panics
    ///
    /// Panics if the start position does not fit in an `i32`.
    fn first(&self) -> i32 {
        self.start()
            .try_into()
            .expect("range start must fit in an i32 to be used with coitrees")
    }

    /// The inclusive last position of the interval, i.e. `end - 1`.
    ///
    /// For an empty range (`start == end`) this is `first() - 1`.
    ///
    /// # Panics
    ///
    /// Panics if the end position does not fit in an `i32`.
    fn last(&self) -> i32 {
        let end: i32 = self
            .end()
            .try_into()
            .expect("range end must fit in an i32 to be used with coitrees");
        // Convert the right-exclusive end to coitrees' right-inclusive "last".
        // `end` is non-negative, so this subtraction cannot overflow.
        end - 1
    }

    /// The index stored alongside the range.
    fn metadata(&self) -> &usize {
        self.index()
    }
}

/// A [`coitrees::BasicCOITree`] interval tree for a single sequence's ranges.
///
/// This is generic over the interval type, to handle the case where one
/// may want to do overlap operations on ranges without associated data in
/// a data container (e.g. ranges that just define megabase windows).
pub struct COITreeRangeContainer<R: Clone> {
/// The interval tree holding the ranges.
ranges: BasicCOITree<R, usize>,
/// The sequence length, used to validate new ranges.
length: Position,
}

impl<R: Clone> COITreeRangeContainer<R> {
    /// Check `start..end` against this container's sequence length.
    ///
    /// The check itself is delegated to the module-level `validate_range`
    /// function, and its result is returned unchanged.
    pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> {
        validate_range(&(start..end), self.length)
    }

    /// Run `visit` on every interval node the tree reports for `start..end`.
    ///
    /// `start` and `end` describe a right-exclusive range. coitrees works with
    /// `i32` coordinates and a right-inclusive "last" position, so both
    /// positions are converted and the end is shifted down by one.
    ///
    /// # Panics
    ///
    /// Panics if either position does not fit in an `i32`.
    pub fn query<F>(&self, start: Position, end: Position, visit: F)
    where
        F: FnMut(&IntervalNode<R, usize>),
    {
        let first = i32::try_from(start).expect("could not covert");
        let last = i32::try_from(end).expect("could not covert") - 1;
        self.ranges.query(first, last, visit)
    }

    /// The number of ranges stored in this [`COITreeRangeContainer`].
    pub fn len(&self) -> usize {
        let tree = &self.ranges;
        tree.len()
    }

    /// Whether this [`COITreeRangeContainer`] stores no ranges.
    pub fn is_empty(&self) -> bool {
        let n_ranges = self.len();
        n_ranges == 0
    }
}

impl<R: Clone + GenericInterval<R>> From<VecRanges<R>> for COITreeRangeContainer<R> {
fn from(value: VecRanges<R>) -> Self {
let ranges = BasicCOITree::new(&value.ranges);
let length = value.length;
Self {
ranges,
length
}
}
}

/// [`RangeContainer`] implementation backed by the interval tree.
impl<R: Clone> RangeContainer for COITreeRangeContainer<R> {
/// Return the number of ranges stored in the interval tree.
fn len(&self) -> usize {
self.ranges.len()
}
}
104 changes: 104 additions & 0 deletions src/ranges/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
use std::ops::Range;

use crate::{Position, error::GRangesError};

pub mod coitrees;
pub mod vec;


/// A 0-indexed, right-exclusive range that carries no index into a data container.
#[derive(Clone, Debug, Default)]
pub struct RangeEmpty {
    range: Range<Position>,
}

impl RangeEmpty {
    /// Create a new 0-indexed right-exclusive range.
    ///
    /// No validation is performed here; `start` and `end` are stored as given.
    pub fn new(start: Position, end: Position) -> Self {
        Self { range: start..end }
    }

    /// The (inclusive) start position of the range.
    pub fn start(&self) -> Position {
        self.range.start
    }

    /// The (exclusive) end position of the range.
    pub fn end(&self) -> Position {
        self.range.end
    }
}

/// A 0-indexed, right-exclusive range paired with an index.
#[derive(Clone, Debug, Default)]
pub struct RangeIndexed {
    range: Range<Position>,
    index: usize,
}

impl RangeIndexed {
    /// Create a new 0-indexed right-exclusive range with the given `index`.
    ///
    /// No validation is performed here; all values are stored as given.
    pub fn new(start: Position, end: Position, index: usize) -> Self {
        Self {
            range: start..end,
            index,
        }
    }

    /// The (inclusive) start position of the range.
    pub fn start(&self) -> Position {
        self.range.start
    }

    /// The (exclusive) end position of the range.
    pub fn end(&self) -> Position {
        self.range.end
    }

    /// The index stored with this range.
    ///
    /// Note: this returning a reference is required to
    /// implement coitrees's GenericInterval trait.
    pub fn index(&self) -> &usize {
        &self.index
    }
}

/// Validates whether a given range is valid for accessing a sequence of a given `length`.
///
/// Ranges are 0-indexed and right-exclusive, so a range whose end equals
/// `length` covers up to the last position of the sequence and is valid.
///
/// # Arguments
///
/// * `range` - The range to validate.
/// * `length` - The length of the sequence.
///
/// # Errors
///
/// * [`GRangesError::InvalidGenomicRange`] if the range's start is greater than its end.
/// * [`GRangesError::InvalidGenomicRangeForSequence`] if the range's end is
///   greater than `length`.
pub fn validate_range(
    range: &std::ops::Range<Position>,
    length: Position,
) -> Result<(), GRangesError> {
    let (start, end) = (range.start, range.end);

    if start > end {
        return Err(GRangesError::InvalidGenomicRange(start, end));
    }

    // `end` is exclusive, so only an end strictly past the sequence length is out of bounds.
    if end > length {
        return Err(GRangesError::InvalidGenomicRangeForSequence(
            start, end, length,
        ));
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;
    use super::validate_range;

    /// A range whose start lies past its end is rejected, and the error
    /// carries the range's actual start and end.
    #[test]
    fn test_invalid_range_start_end() {
        let range = 10..1;
        let result = validate_range(&range, 10);
        assert!(matches!(
            result,
            Err(GRangesError::InvalidGenomicRange(10, 1))
        ));
    }
}
47 changes: 47 additions & 0 deletions src/ranges/vec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
use crate::{traits::RangeContainer, Position, error::GRangesError};

use super::{RangeIndexed, validate_range, RangeEmpty};
/// A [`VecRanges`] container of [`RangeIndexed`] ranges.
pub type VecRangesIndexed = VecRanges<RangeIndexed>;
/// A [`VecRanges`] container of [`RangeEmpty`] ranges.
pub type VecRangesEmpty = VecRanges<RangeEmpty>;

/// A range container that stores its ranges in a [`Vec`].
#[derive(Clone, Default)]
pub struct VecRanges<R: Clone> {
/// The stored ranges.
pub (crate) ranges: Vec<R>,
/// The sequence length that `validate_range` checks ranges against.
pub length: Position,
}

impl<R: Clone> VecRanges<R> {
    /// Check `start..end` against this container's sequence length.
    ///
    /// The check itself is delegated to the module-level `validate_range`
    /// function, and its result is returned unchanged.
    pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> {
        validate_range(&(start..end), self.length)
    }

    /// Build a [`VecRanges`] container with no ranges for a sequence of the given `length`.
    pub fn new(length: Position) -> Self {
        let ranges = Vec::new();
        Self { ranges, length }
    }

    /// Append `range` to the end of this [`VecRanges`] container.
    pub fn push_range(&mut self, range: R) {
        self.ranges.push(range);
    }

    /// The number of ranges stored in this [`VecRanges`] container.
    pub fn len(&self) -> usize {
        let stored = &self.ranges;
        stored.len()
    }

    /// Whether this [`VecRanges`] container stores no ranges.
    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }
}

/// [`RangeContainer`] implementation backed by the inner [`Vec`].
impl<R: Clone> RangeContainer for VecRanges<R> {
/// Return the number of ranges stored in the vector.
fn len(&self) -> usize {
self.ranges.len()
}
}
Loading

0 comments on commit 8de9b47

Please sign in to comment.