Brought Sequences over, IndexedDataContainer no longer has lifeti…

…me, more. - Renamed multiple things for accuracy, consistency. - Removed generic lifetime in `IndexedDataContainer` trait, which greatly simplifies things. - New `Operation` enum for bedtools map-like operations. - `float_compute()` function for doing operations (this will likely change) - Brough sequences stuff over. - `apply_over_join()` and `apply_into_vec()` (both names likely to change). - New sequences doc and test.
vsbuffalo · Feb 24, 2024 · d84ec3e · d84ec3e
1 parent 2b8aa5a
commit d84ec3e
Show file tree

Hide file tree

Showing 24 changed files with 1,609 additions and 45 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,19 +17,22 @@ coitrees = { version = "0.4.0", features = ["nosimd"] }
 flate2 = "1.0.28"
 genomap = "0.2.6"
 indexmap = "2.2.3"
-ndarray = "0.15.6"
-noodles = { version = "0.63.0", features = ["core", "bed"] }
+ndarray = { version = "0.15.6", optional = true}
+noodles = { version = "0.63.0", features = ["core", "bed", "fasta"] }
 rand = "0.8.5"
 tempfile = "3.10.0"
 thiserror = "1.0.57"
 polars = { version = "0.37.0", optional = true }
+bytes = "1.5.0"
+ndarray-npy = { version = "0.8.1", optional = true }
+num-traits = "0.2.18"
 
 [features]
 # cli = [ "clap" ] # TODO make feature
 dev-commands = [ ]
 test_bigranges = [] # NOTE: not used yet, for tests on large files
 polars = ["dep:polars"]
-
+ndarray = ["dep:ndarray", "dep:ndarray-npy"]
 
 [profile.release]
 opt-level = 3

diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
@@ -5,7 +5,7 @@
 //! ensure the output is the *exact* same.
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use granges::test_utilities::{granges_binary_path, random_bedfile};
+use granges::test_utilities::{granges_binary_path, random_bed3file};
 use std::process::Command;
 
 const BED_LENGTH: usize = 100_000;
@@ -15,7 +15,7 @@ fn bench_range_adjustment(c: &mut Criterion) {
  let mut group = c.benchmark_group("adjust");
 
  // create the test data
- let input_bedfile = random_bedfile(BED_LENGTH);
+ let input_bedfile = random_bed3file(BED_LENGTH);
 
  // configure the sample size for the group
  // group.sample_size(10);
@@ -59,8 +59,8 @@ fn bench_filter_adjustment(c: &mut Criterion) {
  let mut group = c.benchmark_group("filter");
 
  // create the test data
- let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH);
- let random_bedfile_right_tempfile = random_bedfile(BED_LENGTH);
+ let random_bedfile_left_tempfile = random_bed3file(BED_LENGTH);
+ let random_bedfile_right_tempfile = random_bed3file(BED_LENGTH);
  let random_bedfile_left = random_bedfile_left_tempfile.path();
  let random_bedfile_right = random_bedfile_right_tempfile.path();
 
@@ -104,7 +104,7 @@ fn bench_flank(c: &mut Criterion) {
  let mut group = c.benchmark_group("flank");
 
  // create the test data
- let random_bedfile_tempfile = random_bedfile(BED_LENGTH);
+ let random_bedfile_tempfile = random_bed3file(BED_LENGTH);
  let random_bedfile = random_bedfile_tempfile.path();
 
  // configure the sample size for the group

diff --git a/src/commands.rs b/src/commands.rs
@@ -3,6 +3,7 @@
 use std::path::PathBuf;
 
 use crate::{
+ data::operations::Operation,
  io::{parsers::GenomicRangesParser, OutputStream},
  prelude::*,
  ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
@@ -401,6 +402,7 @@ pub fn granges_map(
  seqlens: impl Into<PathBuf>,
  left_path: &PathBuf,
  right_path: &PathBuf,
+ operations: Vec<Operation>,
  output: Option<&PathBuf>,
  skip_missing: bool,
 ) -> Result<CommandOutput<()>, GRangesError> {

diff --git a/src/data/mod.rs b/src/data/mod.rs
@@ -3,6 +3,7 @@
 
 use crate::traits::DataContainer;
 
+pub mod operations;
 pub mod vec;
 
 impl<U> DataContainer for Vec<U> {}

diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs
@@ -8,12 +8,18 @@ where
  U: Copy + Default + 'a,
 {
  type Item = U;
+ type OwnedItem = U;
  type Output = Array1<U>;
 
  fn get_value(&'a self, index: usize) -> Self::Item {
  self[index]
  }
 
+ fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
+ // already owned
+ self[index]
+ }
+
  fn len(&self) -> usize {
  self.len()
  }
@@ -32,12 +38,17 @@ where
  U: Copy + Default + 'a,
 {
  type Item = ArrayView1<'a, U>;
+ type OwnedItem = Array1<U>;
  type Output = Array2<U>;
 
  fn get_value(&'a self, index: usize) -> Self::Item {
  self.row(index)
  }
 
+ fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
+ self.row(index).to_owned()
+ }
+
  fn len(&self) -> usize {
  self.shape()[0]
  }

diff --git a/src/data/operations.rs b/src/data/operations.rs
@@ -0,0 +1,69 @@
+//! Implementations of various operations on data.
+//!
+
+use num_traits::{Float, ToPrimitive};
+use std::iter::Sum;
+
+/// Calculate the median.
+///
+/// This will clone and turn `numbers` into a `Vec`.
+pub fn median<F: Float + Ord + Sum>(numbers: &[F]) -> F {
+ let mut numbers = numbers.to_vec();
+ numbers.sort_unstable();
+ let mid = numbers.len() / 2;
+ if numbers.len() % 2 == 0 {
+ (numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap()
+ } else {
+ numbers[mid]
+ }
+}
+
+/// The (subset of) standard `bedtools map` operations.
+pub enum Operation {
+ Sum,
+ Min,
+ Max,
+ Mean,
+ Median,
+ Collapse,
+}
+
+pub enum OperationResult<T>
+where
+ T: Float,
+{
+ Float(T),
+ String(String),
+}
+
+pub fn float_compute<T>(operation: Operation, data: &[T]) -> Option<OperationResult<T>>
+where
+ T: Float + Sum<T> + ToPrimitive + Ord + Clone + ToString,
+{
+ match operation {
+ Operation::Sum => {
+ let sum: T = data.iter().cloned().sum();
+ Some(OperationResult::Float(sum))
+ }
+ Operation::Min => data.iter().cloned().min().map(OperationResult::Float),
+ Operation::Max => data.iter().cloned().max().map(OperationResult::Float),
+ Operation::Mean => {
+ if data.is_empty() {
+ None
+ } else {
+ let sum: T = data.iter().cloned().sum();
+ let mean = sum / T::from(data.len()).unwrap();
+ Some(OperationResult::Float(mean))
+ }
+ }
+ Operation::Median => Some(OperationResult::Float(median(data))),
+ Operation::Collapse => {
+ let collapsed = data
+ .iter()
+ .map(|num| num.to_string())
+ .collect::<Vec<_>>()
+ .join(", ");
+ Some(OperationResult::String(collapsed))
+ }
+ }
+}
diff --git a/src/data/vec.rs b/src/data/vec.rs
@@ -5,17 +5,22 @@ use crate::traits::IndexedDataContainer;
 /// Trait methods for the commonly-used `Vec<U>` data container.
 ///
 /// Note that the associated `Item` type is always a *reference* to the data elements.
-impl<'a, U> IndexedDataContainer<'a> for Vec<U>
+impl<U> IndexedDataContainer for Vec<U>
 where
- U: Clone + 'a,
+ U: Clone,
 {
- type Item = &'a U;
+ type Item<'a> = &'a U where Self: 'a;
+ type OwnedItem = U;
  type Output = Vec<U>;
 
- fn get_value(&'a self, index: usize) -> Self::Item {
+ fn get_value(&self, index: usize) -> Self::Item<'_> {
  self.get(index).unwrap()
  }
 
+ fn get_owned(&self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
+ self.get(index).unwrap().to_owned()
+ }
+
  fn len(&self) -> usize {
  self.len()
  }

diff --git a/src/error.rs b/src/error.rs
@@ -2,7 +2,10 @@
 //!
 use crate::Position;
 use genomap::GenomeMapError;
-use std::num::{ParseFloatError, ParseIntError};
+use std::{
+ num::{ParseFloatError, ParseIntError},
+ string::FromUtf8Error,
+};
 use thiserror::Error;
 
 /// The [`GRangesError`] defines the standard set of errors that should
@@ -48,6 +51,14 @@ pub enum GRangesError {
  #[error("Invalid GRanges object: no data container.")]
  NoDataContainer,
 
+ // Sequences related errors
+ #[error("Sequence name '{0}' was not found.")]
+ MissingSequenceName(String),
+
+ // FASTA/noodles related errors
+ #[error("Error encountered in trying to convert bytes to UTF8 string.")]
+ FromUtf8Error(#[from] FromUtf8Error),
+
  // Command line tool related errors
  #[error("Unsupported genomic range format")]
  UnsupportedGenomicRangesFileFormat,

diff --git a/src/granges.rs b/src/granges.rs
@@ -1,6 +1,6 @@
 //! The [`GRanges<R, T>`] and [`GRangesEmpty`] types, and associated functionality.
 //!
-//!
+//!```text
 //! +----------------------+
 //! | GRanges<R, T> |
 //! | object |
@@ -14,6 +14,7 @@
 //! | container | | container |
 //! | (R) | | (T) |
 //! +-----------+ +-------------+
+//!```
 //!
 //! # [`GRanges<R, T>`] Generic Types
 //!
@@ -87,7 +88,7 @@ use crate::{
  ensure_eq,
  io::OutputStream,
  iterators::GRangesIterator,
- join::{JoinData, LeftGroupedJoin},
+ join::{CombinedJoinData, JoinData, LeftGroupedJoin},
  prelude::GRangesError,
  ranges::{
  coitrees::{COITrees, COITreesEmpty, COITreesIndexed},
@@ -205,13 +206,18 @@ where
  .collect();
  seqlens
  }
+
+ /// Take the data out of this [`GRanges`] object.
+ pub fn take_data(&mut self) -> Result<T, GRangesError> {
+ std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)
+ }
 }
 
 impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges<VecRangesIndexed, T>
 where
- T: IndexedDataContainer<'a>,
+ T: IndexedDataContainer + 'a,
  T: TsvSerialize,
- <T as IndexedDataContainer<'a>>::Item: TsvSerialize,
+ <T as IndexedDataContainer>::Item<'a>: TsvSerialize,
 {
  /// Write
  fn to_tsv(&'a self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
@@ -547,9 +553,9 @@ impl<'a, DL, DR> GRanges<VecRangesIndexed, JoinData<'a, DL, DR>> {
 
 impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
 where
- T: IndexedDataContainer<'a>,
+ T: IndexedDataContainer + 'a,
  T: TsvSerialize,
- <T as IndexedDataContainer<'a>>::Item: TsvSerialize,
+ <T as IndexedDataContainer>::Item<'a>: TsvSerialize,
 {
  /// Write this [`GRanges<VecRanges, T>`] object to a TSV file, using the
  /// [`TsvSerialize`] trait methods defiend for the items in `T`.
@@ -606,17 +612,56 @@ where
  }
 }
 
+impl<'a, DL: Clone + 'a, DR: Clone + 'a> GRanges<VecRangesIndexed, JoinData<'a, DL, DR>>
+where
+ DL: IndexedDataContainer,
+ DR: IndexedDataContainer,
+{
+ /// Apply a function over the [`JoinData`] inside this [`GRanges`].
+ ///
+ /// This is a workhorse method that is used to summarize genomic overlaps. The
+ /// user-specified function, `func` is applied to each "join" item in this [`GRanges`]
+ /// object's data container (which is a [`JoinData`] storing the join information).
+ /// This supplied `func` function returns some generic type `V` per join, which could be
+ /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated,
+ /// etc.
+ ///
+ /// See [`CombinedJoinData`] and its convenience methods, which are designed
+ /// to help downstream statistical calculations that could use the number of overlapping
+ /// basepairs, overlapping fraction, etc.
+ pub fn apply_over_join<F, V>(
+ mut self,
+ func: F,
+ ) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
+ where
+ F: Fn(
+ CombinedJoinData<
+ <DL as IndexedDataContainer>::OwnedItem,
+ <DR as IndexedDataContainer>::OwnedItem,
+ >,
+ ) -> V,
+ {
+ let data = self.take_data()?;
+ let transformed_data: Vec<V> = data.apply_into_vec(func);
+ let ranges = self.ranges;
+ Ok(GRanges {
+ ranges,
+ data: Some(transformed_data),
+ })
+ }
+}
+
 impl<'a, C, T> GRanges<C, T>
 where
- T: IndexedDataContainer<'a>,
+ T: IndexedDataContainer,
 {
  /// Get the data in the data container at specified index.
  ///
  /// # Panics
  /// This will panic if there if the index is invalid, or the
  /// data container is `None`. Both of these indicate internal
  /// design errors: please file an issue of you encounter a panic.
- pub fn get_data_value(&'a self, index: usize) -> <T as IndexedDataContainer>::Item {
+ pub fn get_data_value(&self, index: usize) -> <T as IndexedDataContainer>::Item<'_> {
  self.data
  .as_ref()
  .expect("data container was None")
@@ -839,7 +884,7 @@ where
  let mut gr: GRanges<VecRangesIndexed, Vec<U>> = GRanges::new_vec(&self.seqlens());
 
  let right_ref = right.as_granges_ref();
- let data = std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)?;
+ let data = self.take_data()?;
 
  let mut old_indices = HashSet::new(); // the old indices to *keep*
  let mut new_indices = Vec::new();