Skip to content

Commit

Permalink
Brought Sequences over, IndexedDataContainer no longer has lifeti…
Browse files Browse the repository at this point in the history
…me, more.

 - Renamed multiple things for accuracy, consistency.
 - Removed generic lifetime in `IndexedDataContainer` trait, which
   greatly simplifies things.
 - New `Operation` enum for bedtools map-like operations.
 - `float_compute()` function for doing operations (this will likely change)
 - Brough sequences stuff over.
 - `apply_over_join()` and `apply_into_vec()` (both names likely to change).
 - New sequences doc and test.
  • Loading branch information
vsbuffalo committed Feb 24, 2024
1 parent 2b8aa5a commit d84ec3e
Show file tree
Hide file tree
Showing 24 changed files with 1,609 additions and 45 deletions.
9 changes: 6 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,22 @@ coitrees = { version = "0.4.0", features = ["nosimd"] }
flate2 = "1.0.28"
genomap = "0.2.6"
indexmap = "2.2.3"
ndarray = "0.15.6"
noodles = { version = "0.63.0", features = ["core", "bed"] }
ndarray = { version = "0.15.6", optional = true}
noodles = { version = "0.63.0", features = ["core", "bed", "fasta"] }
rand = "0.8.5"
tempfile = "3.10.0"
thiserror = "1.0.57"
polars = { version = "0.37.0", optional = true }
bytes = "1.5.0"
ndarray-npy = { version = "0.8.1", optional = true }
num-traits = "0.2.18"

[features]
# cli = [ "clap" ] # TODO make feature
dev-commands = [ ]
test_bigranges = [] # NOTE: not used yet, for tests on large files
polars = ["dep:polars"]

ndarray = ["dep:ndarray", "dep:ndarray-npy"]

[profile.release]
opt-level = 3
Expand Down
10 changes: 5 additions & 5 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! ensure the output is the *exact* same.

use criterion::{criterion_group, criterion_main, Criterion};
use granges::test_utilities::{granges_binary_path, random_bedfile};
use granges::test_utilities::{granges_binary_path, random_bed3file};
use std::process::Command;

const BED_LENGTH: usize = 100_000;
Expand All @@ -15,7 +15,7 @@ fn bench_range_adjustment(c: &mut Criterion) {
let mut group = c.benchmark_group("adjust");

// create the test data
let input_bedfile = random_bedfile(BED_LENGTH);
let input_bedfile = random_bed3file(BED_LENGTH);

// configure the sample size for the group
// group.sample_size(10);
Expand Down Expand Up @@ -59,8 +59,8 @@ fn bench_filter_adjustment(c: &mut Criterion) {
let mut group = c.benchmark_group("filter");

// create the test data
let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH);
let random_bedfile_right_tempfile = random_bedfile(BED_LENGTH);
let random_bedfile_left_tempfile = random_bed3file(BED_LENGTH);
let random_bedfile_right_tempfile = random_bed3file(BED_LENGTH);
let random_bedfile_left = random_bedfile_left_tempfile.path();
let random_bedfile_right = random_bedfile_right_tempfile.path();

Expand Down Expand Up @@ -104,7 +104,7 @@ fn bench_flank(c: &mut Criterion) {
let mut group = c.benchmark_group("flank");

// create the test data
let random_bedfile_tempfile = random_bedfile(BED_LENGTH);
let random_bedfile_tempfile = random_bed3file(BED_LENGTH);
let random_bedfile = random_bedfile_tempfile.path();

// configure the sample size for the group
Expand Down
2 changes: 2 additions & 0 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use std::path::PathBuf;

use crate::{
data::operations::Operation,
io::{parsers::GenomicRangesParser, OutputStream},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
Expand Down Expand Up @@ -401,6 +402,7 @@ pub fn granges_map(
seqlens: impl Into<PathBuf>,
left_path: &PathBuf,
right_path: &PathBuf,
operations: Vec<Operation>,
output: Option<&PathBuf>,
skip_missing: bool,
) -> Result<CommandOutput<()>, GRangesError> {
Expand Down
1 change: 1 addition & 0 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

use crate::traits::DataContainer;

pub mod operations;
pub mod vec;

impl<U> DataContainer for Vec<U> {}
Expand Down
11 changes: 11 additions & 0 deletions src/data/ndarray.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,18 @@ where
U: Copy + Default + 'a,
{
type Item = U;
type OwnedItem = U;
type Output = Array1<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
self[index]
}

fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
// already owned
self[index]
}

fn len(&self) -> usize {
self.len()
}
Expand All @@ -32,12 +38,17 @@ where
U: Copy + Default + 'a,
{
type Item = ArrayView1<'a, U>;
type OwnedItem = Array1<U>;
type Output = Array2<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
self.row(index)
}

fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
self.row(index).to_owned()
}

fn len(&self) -> usize {
self.shape()[0]
}
Expand Down
69 changes: 69 additions & 0 deletions src/data/operations.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//! Implementations of various operations on data.
//!

use num_traits::{Float, ToPrimitive};
use std::iter::Sum;

/// Calculate the median.
///
/// This will clone and turn `numbers` into a `Vec`.
pub fn median<F: Float + Ord + Sum>(numbers: &[F]) -> F {
let mut numbers = numbers.to_vec();
numbers.sort_unstable();
let mid = numbers.len() / 2;
if numbers.len() % 2 == 0 {
(numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap()
} else {
numbers[mid]
}
}

/// The (subset of) standard `bedtools map` operations.
pub enum Operation {
Sum,
Min,
Max,
Mean,
Median,
Collapse,
}

pub enum OperationResult<T>
where
T: Float,
{
Float(T),
String(String),
}

pub fn float_compute<T>(operation: Operation, data: &[T]) -> Option<OperationResult<T>>
where
T: Float + Sum<T> + ToPrimitive + Ord + Clone + ToString,
{
match operation {
Operation::Sum => {
let sum: T = data.iter().cloned().sum();
Some(OperationResult::Float(sum))
}
Operation::Min => data.iter().cloned().min().map(OperationResult::Float),
Operation::Max => data.iter().cloned().max().map(OperationResult::Float),
Operation::Mean => {
if data.is_empty() {
None
} else {
let sum: T = data.iter().cloned().sum();
let mean = sum / T::from(data.len()).unwrap();
Some(OperationResult::Float(mean))
}
}
Operation::Median => Some(OperationResult::Float(median(data))),
Operation::Collapse => {
let collapsed = data
.iter()
.map(|num| num.to_string())
.collect::<Vec<_>>()
.join(", ");
Some(OperationResult::String(collapsed))
}
}
}
13 changes: 9 additions & 4 deletions src/data/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@ use crate::traits::IndexedDataContainer;
/// Trait methods for the commonly-used `Vec<U>` data container.
///
/// Note that the associated `Item` type is always a *reference* to the data elements.
impl<'a, U> IndexedDataContainer<'a> for Vec<U>
impl<U> IndexedDataContainer for Vec<U>
where
U: Clone + 'a,
U: Clone,
{
type Item = &'a U;
type Item<'a> = &'a U where Self: 'a;
type OwnedItem = U;
type Output = Vec<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
fn get_value(&self, index: usize) -> Self::Item<'_> {
self.get(index).unwrap()
}

fn get_owned(&self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
self.get(index).unwrap().to_owned()
}

fn len(&self) -> usize {
self.len()
}
Expand Down
13 changes: 12 additions & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
//!
use crate::Position;
use genomap::GenomeMapError;
use std::num::{ParseFloatError, ParseIntError};
use std::{
num::{ParseFloatError, ParseIntError},
string::FromUtf8Error,
};
use thiserror::Error;

/// The [`GRangesError`] defines the standard set of errors that should
Expand Down Expand Up @@ -48,6 +51,14 @@ pub enum GRangesError {
#[error("Invalid GRanges object: no data container.")]
NoDataContainer,

// Sequences related errors
#[error("Sequence name '{0}' was not found.")]
MissingSequenceName(String),

// FASTA/noodles related errors
#[error("Error encountered in trying to convert bytes to UTF8 string.")]
FromUtf8Error(#[from] FromUtf8Error),

// Command line tool related errors
#[error("Unsupported genomic range format")]
UnsupportedGenomicRangesFileFormat,
Expand Down
63 changes: 54 additions & 9 deletions src/granges.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! The [`GRanges<R, T>`] and [`GRangesEmpty`] types, and associated functionality.
//!
//!
//!```text
//! +----------------------+
//! | GRanges<R, T> |
//! | object |
Expand All @@ -14,6 +14,7 @@
//! | container | | container |
//! | (R) | | (T) |
//! +-----------+ +-------------+
//!```
//!
//! # [`GRanges<R, T>`] Generic Types
//!
Expand Down Expand Up @@ -87,7 +88,7 @@ use crate::{
ensure_eq,
io::OutputStream,
iterators::GRangesIterator,
join::{JoinData, LeftGroupedJoin},
join::{CombinedJoinData, JoinData, LeftGroupedJoin},
prelude::GRangesError,
ranges::{
coitrees::{COITrees, COITreesEmpty, COITreesIndexed},
Expand Down Expand Up @@ -205,13 +206,18 @@ where
.collect();
seqlens
}

/// Take the data out of this [`GRanges`] object.
pub fn take_data(&mut self) -> Result<T, GRangesError> {
std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)
}
}

impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges<VecRangesIndexed, T>
where
T: IndexedDataContainer<'a>,
T: IndexedDataContainer + 'a,
T: TsvSerialize,
<T as IndexedDataContainer<'a>>::Item: TsvSerialize,
<T as IndexedDataContainer>::Item<'a>: TsvSerialize,
{
/// Write
fn to_tsv(&'a self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
Expand Down Expand Up @@ -547,9 +553,9 @@ impl<'a, DL, DR> GRanges<VecRangesIndexed, JoinData<'a, DL, DR>> {

impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
where
T: IndexedDataContainer<'a>,
T: IndexedDataContainer + 'a,
T: TsvSerialize,
<T as IndexedDataContainer<'a>>::Item: TsvSerialize,
<T as IndexedDataContainer>::Item<'a>: TsvSerialize,
{
/// Write this [`GRanges<VecRanges, T>`] object to a TSV file, using the
/// [`TsvSerialize`] trait methods defiend for the items in `T`.
Expand Down Expand Up @@ -606,17 +612,56 @@ where
}
}

impl<'a, DL: Clone + 'a, DR: Clone + 'a> GRanges<VecRangesIndexed, JoinData<'a, DL, DR>>
where
DL: IndexedDataContainer,
DR: IndexedDataContainer,
{
/// Apply a function over the [`JoinData`] inside this [`GRanges`].
///
/// This is a workhorse method that is used to summarize genomic overlaps. The
/// user-specified function, `func` is applied to each "join" item in this [`GRanges`]
/// object's data container (which is a [`JoinData`] storing the join information).
/// This supplied `func` function returns some generic type `V` per join, which could be
/// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated,
/// etc.
///
/// See [`CombinedJoinData`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn apply_over_join<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
where
F: Fn(
CombinedJoinData<
<DL as IndexedDataContainer>::OwnedItem,
<DR as IndexedDataContainer>::OwnedItem,
>,
) -> V,
{
let data = self.take_data()?;
let transformed_data: Vec<V> = data.apply_into_vec(func);
let ranges = self.ranges;
Ok(GRanges {
ranges,
data: Some(transformed_data),
})
}
}

impl<'a, C, T> GRanges<C, T>
where
T: IndexedDataContainer<'a>,
T: IndexedDataContainer,
{
/// Get the data in the data container at specified index.
///
/// # Panics
/// This will panic if there if the index is invalid, or the
/// data container is `None`. Both of these indicate internal
/// design errors: please file an issue of you encounter a panic.
pub fn get_data_value(&'a self, index: usize) -> <T as IndexedDataContainer>::Item {
pub fn get_data_value(&self, index: usize) -> <T as IndexedDataContainer>::Item<'_> {
self.data
.as_ref()
.expect("data container was None")
Expand Down Expand Up @@ -839,7 +884,7 @@ where
let mut gr: GRanges<VecRangesIndexed, Vec<U>> = GRanges::new_vec(&self.seqlens());

let right_ref = right.as_granges_ref();
let data = std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)?;
let data = self.take_data()?;

let mut old_indices = HashSet::new(); // the old indices to *keep*
let mut new_indices = Vec::new();
Expand Down
Loading

0 comments on commit d84ec3e

Please sign in to comment.