Skip to content

Commit

Permalink
Basic iterator trait, range pushing methods, to coitrees, and more!
Browse files Browse the repository at this point in the history
 - tests and test utilities
 - rough join stuff
 - interval/range conversion methods
 - clippy & fmt
 - GitHub Rust workflow added
  • Loading branch information
vsbuffalo committed Feb 15, 2024
1 parent 8de9b47 commit 1b5e236
Show file tree
Hide file tree
Showing 12 changed files with 409 additions and 127 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Continuous integration for the crate: build and run the test suite.
name: Rust

# Run on pushes to, and pull requests against, the main branch.
on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  # Keep cargo's colored output in the CI logs.
  CARGO_TERM_COLOR: always

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    # v4 runs on a supported Node.js runtime; v3 triggers deprecation warnings.
    - uses: actions/checkout@v4
    - name: Build
      run: cargo build --verbose
    - name: Run tests
      run: cargo test --verbose
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[package]
name = "granges2"
name = "granges"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
coitrees = { version = "0.4.0", features = ["nosimd"] }
genomap = "0.1.5"
genomap = "0.2.6"
indexmap = "2.2.3"
rand = "0.8.5"
thiserror = "1.0.57"
6 changes: 6 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use genomap::GenomeMapError;
use thiserror::Error;

use crate::Position;
Expand All @@ -10,4 +11,9 @@ pub enum GRangesError {
#[error("Range [{0}, {1}] is invalid for sequence of length {2}")]
InvalidGenomicRangeForSequence(Position, Position, Position),

#[error("Sequence name '{0}' is not the ranges container")]
MissingSequence(String),

#[error("Error encountered in genomap::GenomeMap")]
GenomeMapError(#[from] GenomeMapError),
}
117 changes: 88 additions & 29 deletions src/granges.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
use genomap::GenomeMap;

use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position};


use indexmap::IndexMap;

use crate::{
prelude::GRangesError,
ranges::{
coitrees::{COITrees, COITreesIndexed},
vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
RangeEmpty, RangeIndexed,
},
traits::RangeContainer,
Position,
};

/// A container of genomic ranges, grouped by sequence, with an optional
/// associated data container.
///
/// `C` is the per-sequence range container type (e.g. [`VecRangesIndexed`] or
/// [`COITreesIndexed`]) and `T` is the type of the data container.
#[derive(Clone)]
pub struct GRanges<C, T> {
/// The range containers, one per sequence, looked up by sequence name.
ranges: GenomeMap<C>,
/// The data container; `None` until data has been attached.
data: Option<T>,
}


impl<C, T> GRanges<C, T>
where C: RangeContainer {

where
C: RangeContainer,
{
/// Get the total number of ranges.
pub fn len(&self) -> usize {
self.ranges.values().map(|ranges| ranges.len()).sum()
Expand All @@ -24,25 +34,35 @@ where C: RangeContainer {
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {

/// Create a new [`GRanges`] object, with vector storage for ranges and data.
///
/// This combination of range and data containers is used when loading data into
/// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's
/// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`]
/// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's
/// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] to
/// call the [`Vec`]'s shrink-to-size methods on the range and data containers
/// after data loading to shrink to the minimal necessary size (this can reduce
/// memory usage).
pub fn new_vec() -> Self {
let ranges = GenomeMap::new();
Self {
ranges,
data: None,
pub fn new_vec(seqlens: &IndexMap<String, Position>) -> Self {
let mut ranges = GenomeMap::new();
for (seqname, length) in seqlens.iter() {
// this should never happen because the error is only if
// insert encounters a seqname that's already been inserted -- that
// cannot happen here.
ranges
.insert(seqname, VecRanges::new(*length))
.expect("Internal error: please report");
}
Self { ranges, data: None }
}


pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) {
/// Push a genomic range with its data to the range and data containers in a [`GRanges`] object.
pub fn push_range_with_data(
&mut self,
seqname: &str,
start: Position,
end: Position,
data: U,
) -> Result<(), GRangesError> {
// push data to the vec data container, getting the index
let index: usize = {
let data_container = self.data.get_or_insert_with(Vec::new);
Expand All @@ -51,39 +71,72 @@ impl<U> GRanges<VecRangesIndexed, Vec<U>> {
};
// push an indexed range
let range = RangeIndexed::new(start, end, index);
self.ranges.entry_or_default(seqname).ranges.push(range);
let range_container = self
.ranges
.get_mut(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;
range_container.push_range(range);
Ok(())
}
}

impl GRanges<VecRangesEmpty, ()> {

/// Create a new [`GRanges`] object, with vector storage for ranges and no data container.
pub fn new_vec_empty() -> Self {
let ranges = GenomeMap::new();
Self {
ranges,
data: None,
}
Self { ranges, data: None }
}

/// Push an empty range (no data) to the [`VecRangesEmpty`] range container.
pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) {
pub fn push_range_only(
&mut self,
seqname: &str,
start: Position,
end: Position,
) -> Result<(), GRangesError> {
// push an unindexed (empty) range
let range = RangeEmpty::new(start, end);
self.ranges.entry_or_default(seqname).ranges.push(range);
let range_container = self
.ranges
.get_mut(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;
range_container.push_range(range);
Ok(())
}
}


impl<T> GRanges<VecRangesIndexed, T> {
    /// Consume this [`GRanges`] and rebuild it on top of cache-oblivious interval
    /// trees, turning every per-sequence [`VecRangesIndexed`] container into a
    /// [`COITreesIndexed`] one (via Daniel C. Jones' [`coitrees`] library).
    ///
    /// The data container is moved into the new object unchanged.
    ///
    /// # Errors
    ///
    /// Propagates, as a [`GRangesError`], any error returned while inserting a
    /// converted container into the new genome map.
    pub fn to_coitrees(self) -> Result<GRanges<COITreesIndexed, T>, GRangesError> {
        // Take the object apart so both pieces can be moved independently.
        let GRanges {
            ranges: vec_map,
            data,
        } = self;

        let mut tree_map = GenomeMap::new();
        for (name, container) in vec_map.into_iter() {
            tree_map.insert(&name, COITrees::from(container))?;
        }

        Ok(GRanges {
            ranges: tree_map,
            data,
        })
    }
}

#[cfg(test)]
mod tests {
use crate::{prelude::*, test_utilities::random_vecranges};
use indexmap::indexmap;

use crate::{
prelude::*,
test_utilities::{granges_test_case_01, random_vecranges},
};

#[test]
fn test_new_vec() {
let mut gr = GRanges::new_vec();
gr.push_range_with_data("chr1", 0, 10, 1.1);
let seqlens = indexmap! { "chr1".to_string() => 10};
let mut gr = GRanges::new_vec(&seqlens);
gr.push_range_with_data("chr1", 0, 10, 1.1).unwrap();
assert_eq!(gr.len(), 1);
}

Expand All @@ -93,4 +146,10 @@ mod tests {
assert_eq!(vr.len(), 100)
}

#[test]
fn test_to_coitrees() {
let gr_vec = granges_test_case_01();
let gr = gr_vec.clone().to_coitrees().unwrap();
assert_eq!(gr.len(), 5);
}
}
11 changes: 11 additions & 0 deletions src/iterators.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/// The [`RangesIterable`] trait defines common functionality for iterating over
/// range types that can be cloned (`R: Clone`).
pub trait RangesIterable<R: Clone> {
/// Return a boxed iterator that yields ranges of type `R` by value while
/// borrowing from `self`.
fn iter_ranges(&self) -> Box<dyn Iterator<Item = R> + '_>;
}

/// The [`RangesIntoIterable`] trait is the consuming counterpart of
/// [`RangesIterable`]: it iterates over ranges of type `R` by taking ownership
/// of the container.
pub trait RangesIntoIterable<R: Clone> {
/// Consume `self` and return a boxed iterator that yields ranges of type `R`.
fn into_iter_ranges(self) -> Box<dyn Iterator<Item = R>>;
}


43 changes: 43 additions & 0 deletions src/join.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,45 @@
use std::rc::Rc;

use crate::Position;

/// The result of joining one left range against a set of right ranges.
///
/// `DL` and `DR` are the types of the left and right data containers.
// NOTE(review): presumably one `JoinData` is produced per left range by a join
// operation; no constructor or producer exists in this file yet -- confirm.
pub struct JoinData<DL, DR> {
/// The data index for the left range.
left: usize,

/// A `Vec` of the indices for the overlapping right ranges.
rights: Vec<usize>,

/// The length of the left range.
left_length: Position,

/// The lengths of the right ranges.
right_lengths: Vec<Position>,

/// The lengths of the overlaps between the left and right ranges.
overlaps: Vec<Position>,

// TODO: we may want some simple summary of whether something is
// up or downstream. I think the cleanest summary is a signed integer
// representing the side and degree of non-overlap. E.g. a range
// that overlaps another but overhangs the 3' side of the focal left
// range by 10bp is +10; if it were 5', it would be -10.
/// An optional reference-counted handle to the left data, of type `DL`.
left_data: Option<Rc<DL>>,

/// An optional reference-counted handle to the right data, of type `DR`.
right_data: Option<Rc<DR>>,
}

/// Holds optional reference-counted handles to the left (`DL`) and right (`DR`)
/// data containers of a join.
// NOTE(review): presumably intended to implement `Iterator` over `JoinData`
// items; no impl exists yet (see the commented-out sketch below) -- confirm.
pub struct JoinIterator<DL, DR> {
/// An optional reference-counted handle to the left data.
left_data: Option<Rc<DL>>,
/// An optional reference-counted handle to the right data.
right_data: Option<Rc<DR>>,
}
//
// impl<DL, DR> JoinIterator<DL, DR> {
//
// }
//
//
// pub fn left_join<CL, CR, DL, DR>(left: GRanges<CL, DL>, right: GRanges<CR, DR>) -> JoinIterator<DL, DR> {
//
// }
37 changes: 33 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,42 @@
// Copyright (2024) Vince Buffalo
#![crate_name = "granges"]
#![doc(html_root_url = "https://docs.rs/granges/")]

pub mod traits;
pub mod ranges;
pub mod granges;
pub mod error;
pub mod granges;
pub mod iterators;
pub mod join;
pub mod ranges;
pub mod test_utilities;
pub mod traits;

pub type Position = u32;

pub mod prelude {
pub use crate::granges::GRanges;
pub use crate::error::GRangesError;
pub use crate::granges::GRanges;
pub use crate::iterators::RangesIterable;

pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
}

/// Create a new `GRanges<C, T>` with sequence length information (used primarily
/// for small examples and tests).
///
/// The macro takes, in order:
///
/// 1. the range container type and the data container type of the resulting
///    `GRanges` (the object is built with `GRanges::new_vec()`, so these must be
///    the types that constructor produces),
/// 2. a braced map of `seqname => [(start, end, data), ...]` entries, and
/// 3. `seqlens: { seqname => length, ... }`, the sequence lengths.
///
/// Trailing commas are accepted in every list.
///
/// `GRanges` is referenced through `$crate`, so callers do not need to import it
/// themselves. The expansion does name `::indexmap::IndexMap`, so the calling
/// crate must depend on `indexmap`.
///
/// # Panics
///
/// Panics with "Failed to push range" if pushing any range fails, e.g. because
/// its sequence name is not listed in `seqlens`.
#[macro_export]
macro_rules! create_granges_with_seqlens {
    (
        $range_type:ty,
        $data_type:ty,
        { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),* $(,)?]),* $(,)? },
        seqlens: { $($chr_len:expr => $len:expr),* $(,)? }
    ) => {
        {
            let mut seqlens = ::indexmap::IndexMap::new();
            $(seqlens.insert($chr_len.to_string(), $len);)*

            // Fully qualified through `$crate` so the macro also expands in
            // scopes (and crates) that have not imported `GRanges`.
            let mut gr: $crate::granges::GRanges<$range_type, $data_type> =
                $crate::granges::GRanges::new_vec(&seqlens);

            $(
                $(
                    gr.push_range_with_data(&$chr.to_string(), $start, $end, $data)
                        .expect("Failed to push range");
                )*
            )*

            gr
        }
    };
}
Loading

0 comments on commit 1b5e236

Please sign in to comment.