diff --git a/Cargo.toml b/Cargo.toml index fb5ac8a..ba3059c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,19 +17,22 @@ coitrees = { version = "0.4.0", features = ["nosimd"] } flate2 = "1.0.28" genomap = "0.2.6" indexmap = "2.2.3" -ndarray = "0.15.6" -noodles = { version = "0.63.0", features = ["core", "bed"] } +ndarray = { version = "0.15.6", optional = true} +noodles = { version = "0.63.0", features = ["core", "bed", "fasta"] } rand = "0.8.5" tempfile = "3.10.0" thiserror = "1.0.57" polars = { version = "0.37.0", optional = true } +bytes = "1.5.0" +ndarray-npy = { version = "0.8.1", optional = true } +num-traits = "0.2.18" [features] # cli = [ "clap" ] # TODO make feature dev-commands = [ ] test_bigranges = [] # NOTE: not used yet, for tests on large files polars = ["dep:polars"] - +ndarray = ["dep:ndarray", "dep:ndarray-npy"] [profile.release] opt-level = 3 diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 89e8e91..8d5a677 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -5,7 +5,7 @@ //! ensure the output is the *exact* same. use criterion::{criterion_group, criterion_main, Criterion}; -use granges::test_utilities::{granges_binary_path, random_bedfile}; +use granges::test_utilities::{granges_binary_path, random_bed3file}; use std::process::Command; const BED_LENGTH: usize = 100_000; @@ -15,7 +15,7 @@ fn bench_range_adjustment(c: &mut Criterion) { let mut group = c.benchmark_group("adjust"); // create the test data - let input_bedfile = random_bedfile(BED_LENGTH); + let input_bedfile = random_bed3file(BED_LENGTH); // configure the sample size for the group // group.sample_size(10); @@ -59,8 +59,8 @@ fn bench_filter_adjustment(c: &mut Criterion) { let mut group = c.benchmark_group("filter"); // create the test data - let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH); - let random_bedfile_right_tempfile = random_bedfile(BED_LENGTH); + let random_bedfile_left_tempfile = random_bed3file(BED_LENGTH); + let random_bedfile_right_tempfile = random_bed3file(BED_LENGTH); let random_bedfile_left = random_bedfile_left_tempfile.path(); let random_bedfile_right = random_bedfile_right_tempfile.path(); @@ -104,7 +104,7 @@ fn bench_flank(c: &mut Criterion) { let mut group = c.benchmark_group("flank"); // create the test data - let random_bedfile_tempfile = random_bedfile(BED_LENGTH); + let random_bedfile_tempfile = random_bed3file(BED_LENGTH); let random_bedfile = random_bedfile_tempfile.path(); // configure the sample size for the group diff --git a/src/commands.rs b/src/commands.rs index e239b26..e216546 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use crate::{ + data::operations::Operation, io::{parsers::GenomicRangesParser, OutputStream}, prelude::*, ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, @@ -401,6 +402,7 @@ pub fn granges_map( seqlens: impl Into, left_path: &PathBuf, right_path: &PathBuf, + operations: Vec, output: Option<&PathBuf>, skip_missing: bool, ) -> Result, GRangesError> { diff --git a/src/data/mod.rs b/src/data/mod.rs index 1cea0b5..c3ac1cb 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -3,6 +3,7 @@ use crate::traits::DataContainer; +pub mod operations; pub mod vec; impl DataContainer for Vec {} diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs index 2885f34..6aab541 100644 --- a/src/data/ndarray.rs +++ b/src/data/ndarray.rs @@ -8,12 +8,18 @@ where U: Copy + Default + 'a, { type Item = U; + type OwnedItem = U; 
type Output = Array1; fn get_value(&'a self, index: usize) -> Self::Item { self[index] } + fn get_owned(&'a self, index: usize) -> ::OwnedItem { + // already owned + self[index] + } + fn len(&self) -> usize { self.len() } @@ -32,12 +38,17 @@ where U: Copy + Default + 'a, { type Item = ArrayView1<'a, U>; + type OwnedItem = Array1; type Output = Array2; fn get_value(&'a self, index: usize) -> Self::Item { self.row(index) } + fn get_owned(&'a self, index: usize) -> ::OwnedItem { + self.row(index).to_owned() + } + fn len(&self) -> usize { self.shape()[0] } diff --git a/src/data/operations.rs b/src/data/operations.rs new file mode 100644 index 0000000..406be3a --- /dev/null +++ b/src/data/operations.rs @@ -0,0 +1,69 @@ +//! Implementations of various operations on data. +//! + +use num_traits::{Float, ToPrimitive}; +use std::iter::Sum; + +/// Calculate the median. +/// +/// This will clone and turn `numbers` into a `Vec`. +pub fn median(numbers: &[F]) -> F { + let mut numbers = numbers.to_vec(); + numbers.sort_unstable(); + let mid = numbers.len() / 2; + if numbers.len() % 2 == 0 { + (numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap() + } else { + numbers[mid] + } +} + +/// The (subset of) standard `bedtools map` operations. +pub enum Operation { + Sum, + Min, + Max, + Mean, + Median, + Collapse, +} + +pub enum OperationResult +where + T: Float, +{ + Float(T), + String(String), +} + +pub fn float_compute(operation: Operation, data: &[T]) -> Option> +where + T: Float + Sum + ToPrimitive + Ord + Clone + ToString, +{ + match operation { + Operation::Sum => { + let sum: T = data.iter().cloned().sum(); + Some(OperationResult::Float(sum)) + } + Operation::Min => data.iter().cloned().min().map(OperationResult::Float), + Operation::Max => data.iter().cloned().max().map(OperationResult::Float), + Operation::Mean => { + if data.is_empty() { + None + } else { + let sum: T = data.iter().cloned().sum(); + let mean = sum / T::from(data.len()).unwrap(); + Some(OperationResult::Float(mean)) + } + } + Operation::Median => Some(OperationResult::Float(median(data))), + Operation::Collapse => { + let collapsed = data + .iter() + .map(|num| num.to_string()) + .collect::>() + .join(", "); + Some(OperationResult::String(collapsed)) + } + } +} diff --git a/src/data/vec.rs b/src/data/vec.rs index bd42639..42ccd79 100644 --- a/src/data/vec.rs +++ b/src/data/vec.rs @@ -5,17 +5,22 @@ use crate::traits::IndexedDataContainer; /// Trait methods for the commonly-used `Vec` data container. /// /// Note that the associated `Item` type is always a *reference* to the data elements. -impl<'a, U> IndexedDataContainer<'a> for Vec +impl IndexedDataContainer for Vec where - U: Clone + 'a, + U: Clone, { - type Item = &'a U; + type Item<'a> = &'a U where Self: 'a; + type OwnedItem = U; type Output = Vec; - fn get_value(&'a self, index: usize) -> Self::Item { + fn get_value(&self, index: usize) -> Self::Item<'_> { self.get(index).unwrap() } + fn get_owned(&self, index: usize) -> ::OwnedItem { + self.get(index).unwrap().to_owned() + } + fn len(&self) -> usize { self.len() } diff --git a/src/error.rs b/src/error.rs index 0f15f26..65f7fc3 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,7 +2,10 @@ //! 
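// Editor's note (not part of the diff): the `median()` and `float_compute()` functions in
// src/data/operations.rs above rely on `T: Ord` (via `sort_unstable()` and `Iterator::min()`/
// `max()`), but `f32`/`f64` only implement `PartialOrd`. A minimal, non-generic sketch of a
// float median that works around this with `f64::total_cmp()` (which gives floats a total order):
fn median_f64(numbers: &[f64]) -> Option<f64> {
    if numbers.is_empty() {
        return None;
    }
    let mut numbers = numbers.to_vec();
    numbers.sort_unstable_by(f64::total_cmp);
    let mid = numbers.len() / 2;
    if numbers.len() % 2 == 0 {
        Some((numbers[mid - 1] + numbers[mid]) / 2.0)
    } else {
        Some(numbers[mid])
    }
}

fn main() {
    assert_eq!(median_f64(&[3.0, 1.0, 2.0]), Some(2.0));
    assert_eq!(median_f64(&[4.0, 1.0, 2.0, 3.0]), Some(2.5));
}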
use crate::Position; use genomap::GenomeMapError; -use std::num::{ParseFloatError, ParseIntError}; +use std::{ + num::{ParseFloatError, ParseIntError}, + string::FromUtf8Error, +}; use thiserror::Error; /// The [`GRangesError`] defines the standard set of errors that should @@ -48,6 +51,14 @@ pub enum GRangesError { #[error("Invalid GRanges object: no data container.")] NoDataContainer, + // Sequences related errors + #[error("Sequence name '{0}' was not found.")] + MissingSequenceName(String), + + // FASTA/noodles related errors + #[error("Error encountered in trying to convert bytes to UTF8 string.")] + FromUtf8Error(#[from] FromUtf8Error), + // Command line tool related errors #[error("Unsupported genomic range format")] UnsupportedGenomicRangesFileFormat, diff --git a/src/granges.rs b/src/granges.rs index ed191d4..6e83cfe 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,6 +1,6 @@ //! The [`GRanges`] and [`GRangesEmpty`] types, and associated functionality. //! -//! +//!```text //! +----------------------+ //! | GRanges | //! | object | @@ -14,6 +14,7 @@ //! | container | | container | //! | (R) | | (T) | //! +-----------+ +-------------+ +//!``` //! //! # [`GRanges`] Generic Types //! @@ -87,7 +88,7 @@ use crate::{ ensure_eq, io::OutputStream, iterators::GRangesIterator, - join::{JoinData, LeftGroupedJoin}, + join::{CombinedJoinData, JoinData, LeftGroupedJoin}, prelude::GRangesError, ranges::{ coitrees::{COITrees, COITreesEmpty, COITreesIndexed}, @@ -205,13 +206,18 @@ where .collect(); seqlens } + + /// Take the data out of this [`GRanges`] object. + pub fn take_data(&mut self) -> Result { + std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer) + } } impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges where - T: IndexedDataContainer<'a>, + T: IndexedDataContainer + 'a, T: TsvSerialize, - >::Item: TsvSerialize, + ::Item<'a>: TsvSerialize, { /// Write fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { @@ -547,9 +553,9 @@ impl<'a, DL, DR> GRanges> { impl<'a, T> GRanges, T> where - T: IndexedDataContainer<'a>, + T: IndexedDataContainer + 'a, T: TsvSerialize, - >::Item: TsvSerialize, + ::Item<'a>: TsvSerialize, { /// Write this [`GRanges`] object to a TSV file, using the /// [`TsvSerialize`] trait methods defiend for the items in `T`. @@ -606,9 +612,48 @@ where } } +impl<'a, DL: Clone + 'a, DR: Clone + 'a> GRanges> +where + DL: IndexedDataContainer, + DR: IndexedDataContainer, +{ + /// Apply a function over the [`JoinData`] inside this [`GRanges`]. + /// + /// This is a workhorse method that is used to summarize genomic overlaps. The + /// user-specified function, `func` is applied to each "join" item in this [`GRanges`] + /// object's data container (which is a [`JoinData`] storing the join information). + /// This supplied `func` function returns some generic type `V` per join, which could be + /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated, + /// etc. + /// + /// See [`CombinedJoinData`] and its convenience methods, which are designed + /// to help downstream statistical calculations that could use the number of overlapping + /// basepairs, overlapping fraction, etc. + pub fn apply_over_join( + mut self, + func: F, + ) -> Result>, GRangesError> + where + F: Fn( + CombinedJoinData< +
::OwnedItem, + ::OwnedItem, + >, + ) -> V, + { + let data = self.take_data()?; + let transformed_data: Vec = data.apply_into_vec(func); + let ranges = self.ranges; + Ok(GRanges { + ranges, + data: Some(transformed_data), + }) + } +} + impl<'a, C, T> GRanges where - T: IndexedDataContainer<'a>, + T: IndexedDataContainer, { /// Get the data in the data container at specified index. /// @@ -616,7 +661,7 @@ where /// This will panic if there if the index is invalid, or the /// data container is `None`. Both of these indicate internal /// design errors: please file an issue of you encounter a panic. - pub fn get_data_value(&'a self, index: usize) -> ::Item { + pub fn get_data_value(&self, index: usize) -> ::Item<'_> { self.data .as_ref() .expect("data container was None") @@ -839,7 +884,7 @@ where let mut gr: GRanges> = GRanges::new_vec(&self.seqlens()); let right_ref = right.as_granges_ref(); - let data = std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)?; + let data = self.take_data()?; let mut old_indices = HashSet::new(); // the old indices to *keep* let mut new_indices = Vec::new(); diff --git a/src/join.rs b/src/join.rs index 90260cd..c0d1ab9 100644 --- a/src/join.rs +++ b/src/join.rs @@ -2,7 +2,10 @@ //! #![allow(clippy::all)] -use crate::{traits::GenericRange, Position}; +use crate::{ + traits::{GenericRange, IndexedDataContainer}, + Position, +}; /// [`LeftGroupedJoin`] contains information about the right ranges /// and their degree of overlap with a focal left range. This information @@ -14,7 +17,8 @@ pub struct LeftGroupedJoin { left: Option, /// A `Vec` of the indices for the overlapping right ranges. - rights: Vec>, + /// This is `None` if the right ranges do not have a data container. + rights: Option>, /// The length of the left range. left_length: Position, @@ -39,7 +43,7 @@ impl LeftGroupedJoin { pub fn new(left_range: &R) -> Self { Self { left: left_range.index(), - rights: Vec::new(), + rights: None, left_length: left_range.width(), right_lengths: Vec::new(), overlaps: Vec::new(), @@ -52,7 +56,10 @@ impl LeftGroupedJoin { // Note: in principle, this can be called on *non-overlapping* right ranges too, // for a full-outer join. pub fn add_right(&mut self, left: &R, right: &Q) { - self.rights.push(right.index()); + if let Some(right_index) = right.index() { + // the right range has data -- add to vec, initializing if not there + self.rights.get_or_insert_with(Vec::new).push(right_index) + } self.right_lengths.push(right.width()); self.overlaps.push(left.overlap_width(right)); } @@ -112,6 +119,70 @@ impl<'a, DL, DR> JoinData<'a, DL, DR> { } } +impl<'a, DL, DR> JoinData<'a, DL, DR> +where + DL: IndexedDataContainer + 'a, + DR: IndexedDataContainer + 'a, +{ + pub fn apply_into_vec(&self, func: F) -> Vec + where + F: Fn( + CombinedJoinData< +
+                <DL as IndexedDataContainer>::OwnedItem,
+                <DR as IndexedDataContainer>::OwnedItem,
+            >,
+        ) -> V,
+    {
+        // Cloning `left_data` and `right_data` to ensure they live long enough.
+        // This might not be the most efficient but ensures lifetime correctness.
+
+        self.joins
+            .iter()
+            .map(|join| {
+                let left_data = join.left.and_then(|idx| {
+                    self.left_data
+                        .as_ref()
+                        .and_then(|data| Some(data.get_owned(idx)))
+                });
+
+                let right_data = join.rights.as_ref().and_then(|indices| {
+                    Some(
+                        indices
+                            .iter()
+                            .filter_map(|&idx| {
+                                self.right_data
+                                    .as_ref()
+                                    .and_then(|data| Some(data.get_owned(idx)))
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                });
+
+                // Now `func` is applied to each `CombinedJoinData`
+                func(CombinedJoinData {
+                    left_data,
+                    right_data,
+                })
+            })
+            .collect()
+    }
+}
+
+/// Represents a combined view of a single join, holding owned copies of the
+/// left and right data elements involved in that join.
+///
+/// This struct is particularly useful for iterating over join results while
+/// maintaining access to the data elements that were involved in each join. It
+/// holds the optional left data element (`left_data`) and the optional overlapping
+/// right data elements (`right_data`), allowing for easy retrieval and inspection
+/// of the data involved in the join. (The join statistics themselves, e.g.
+/// overlap widths, are not currently stored here.)
+pub struct CombinedJoinData<DL, DR> {
+    // pub join: LeftGroupedJoin, // The join information
+    pub left_data: Option<DL>
, // The left data element + pub right_data: Option>, // The right data elements +} + /// An iterator over the [`LeftGroupedJoin`] types that represent /// information about overlaps right ranges have with a particular left range. /// diff --git a/src/lib.rs b/src/lib.rs index 2d84af8..a1832e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,8 +24,8 @@ //! To understand how GRanges could be useful in processing genomic data, it's helpful to first //! understand the overall design of this software library. //! -//! Nearly all range data is loaded into GRanges using **parsing iterators** ([parsers]). -//! Parsing iterators read input data from some bioinformatics file format (e.g. `.bed`, +//! Nearly all range data is loaded into GRanges using **parsing iterators** (see the [parsers] +//! module). Parsing iterators read input data from some bioinformatics file format (e.g. `.bed`, //! `.gtf.gz`, etc) and it then goes down one of two paths: //! //! 1. Streaming mode: the entire data set is never read into memory all at once, but rather @@ -37,7 +37,7 @@ //! [`GRanges`] objects then provide high-level in-memory methods for working with this //! genomic data. //! -//! Both modes then write something to output: a TSV of some statistic calculated on the +//! Both modes then eventually write something to output: a TSV of some statistic calculated on the //! genomic data, the output of 10 million block bootstraps, or the coefficients of //! linear regressions run every kilobase on sequence data. //! @@ -46,8 +46,8 @@ //! //! ```text //! +-------+ +-------------------+ +----------------------+ -//! | Input | -----> | Iterator | -----> | GRanges object | -//! +------ + | (streaming mode) | | (in-memory mode) | +//! | Input | -----> | Parsing Iterator | -----> | GRanges object | +//! +------ + | (streaming mode) | | (in-memory mode) | //! +-------------------+ +----------------------+ //! | | //! | | @@ -107,6 +107,7 @@ pub mod io; pub mod iterators; pub mod join; pub mod ranges; +pub mod sequences; pub mod test_utilities; pub mod traits; @@ -116,6 +117,17 @@ pub mod commands; pub mod reporting; /// The main position type in GRanges. +/// +/// This type is currently an unwrapped [`u32`]. To my knowledge, +/// no chromosome is known to have a length larger than [`u32::MAX`], +/// which is 4,294,967,295, i.e. 4.29 Gigabases. +/// +/// # Stability +/// This type may change either due to (1) wrapping in a newtype, +/// and/or (2) become a [`u64`] if there is a species with a +/// single chromosome's length surpassing [`u32::MAX`]. +/// +/// [`u32::Max`]: std::u32::MAX pub type Position = u32; /// The main *signed* position type in GRanges, to represent offsets (e.g. diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 45505dd..39edda5 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -2,6 +2,8 @@ //! //! +use std::ops::Range; + use crate::{ error::GRangesError, traits::{ @@ -388,9 +390,9 @@ impl GenomicRangeIndexedRecord { self, seqnames: &[String], data: &'a T, - ) -> GenomicRangeRecord<>::Item> + ) -> GenomicRangeRecord<::Item<'a>> where - T: IndexedDataContainer<'a> + TsvSerialize, + T: IndexedDataContainer + TsvSerialize, { let data = data.get_value(self.index().unwrap()); @@ -432,16 +434,14 @@ impl AdjustableGenericRange for GenomicRangeIndexedRecord { } } -/// Validates whether a given range is valid for accessing a sequence of a given `length`. +/// Validates whether a given range is valid for accessing a sequence of a given `length`, +/// raising a [`GRangesError`] if not. 
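// Editor's sketch (not part of the diff): the kind of summary closure that
// `JoinData::apply_into_vec()` and `GRanges::apply_over_join()` (added above in src/join.rs
// and src/granges.rs) expect. Each call receives one `CombinedJoinData` with owned left/right
// data and returns a single summary value; here, the sum of the overlapping right values.
use granges::join::CombinedJoinData;

fn sum_right(join: CombinedJoinData<f64, f64>) -> f64 {
    join.right_data
        .map(|vals| vals.iter().sum::<f64>())
        .unwrap_or(0.0)
}

fn main() {
    // A hand-built join result, standing in for what a real overlap join would produce.
    let join = CombinedJoinData {
        left_data: Some(1.0),
        right_data: Some(vec![2.0, 3.5]),
    };
    assert_eq!(sum_right(join), 5.5);
}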
 ///
 /// # Arguments
 ///
 /// * `range` - The range to validate.
 /// * `length` - The length of the sequence.
 ///
-/// # Returns
-///
-/// * `bool` - `true` if the range is valid for the sequence; otherwise, `false`.
 pub fn validate_range(
     start: Position,
     end: Position,
     length: Position,
 ) -> Result<(), GRangesError> {
@@ -459,6 +459,36 @@
     Ok(())
 }
 
+/// Try converting genome positions to a right-exclusive [`std::ops::Range`],
+/// checking that the range is valid. This is predominantly used for building
+/// [`std::ops::Range`] items that are used to slice (e.g. nucleotide) sequences.
+///
+/// # Arguments
+/// * `start` - the 0-indexed start position.
+/// * `end` - the 0-indexed, *right-exclusive* end position.
+/// * `length` - the length of the sequence.
+///
+/// # Returns
+/// Returns `Ok(Range<usize>)` if the conversion could be done successfully;
+/// otherwise returns an error.
+pub fn try_range(
+    start: Position,
+    end: Position,
+    length: Position,
+) -> Result<Range<usize>, GRangesError> {
+    if start >= end {
+        return Err(GRangesError::InvalidGenomicRange(start, end));
+    }
+    if end > length {
+        return Err(GRangesError::InvalidGenomicRangeForSequence(
+            start, end, length,
+        ));
+    }
+    let start_usize: usize = start.try_into().unwrap();
+    let end_usize: usize = end.try_into().unwrap();
+    Ok(start_usize..end_usize)
+}
+
 #[cfg(test)]
 mod tests {
     use super::{validate_range, RangeEmpty};
diff --git a/src/sequences/lazy.rs b/src/sequences/lazy.rs
new file mode 100644
index 0000000..bc611ba
--- /dev/null
+++ b/src/sequences/lazy.rs
@@ -0,0 +1,230 @@
+//! Functionality for lazy-loading sequences off disk into memory.
+//!
+//! The main functionality is the very generic [`LazyLoader`]. This is generic over the reader,
+//! the loaded data type, and the key type. It mainly handles loading data into a [`RefCell`].
+//!
+use std::cell::{Ref, RefCell};
+
+use crate::prelude::GRangesError;
+
+/// A lazy-loader function that takes a reader type `R` and
+/// uses it to load in data of type `T`.
+type LoaderFunc<R, T, K> = Box<dyn Fn(&mut R, &K) -> Result<T, GRangesError>>;
+
+/// Lazy loader, which uses [`RefCell`]s to store a mutable reader and the data,
+/// used for lazily loading and storing one key's worth of data.
+///
+/// Generic key types allow for keys to be region tuples, etc.
+///
+/// # Generics
+/// * `R`: the reader type.
+/// * `T`: the data type.
+/// * `K`: the key type.
+///
+///
+/// * `key`: the key of the loaded data (`None` if nothing loaded).
+/// * `reader`: the open reader type (e.g. an indexed FASTA file).
+/// * `loader`: a function that takes a key type `K` and an open reader,
+///    and retrieves the right data of type `T`.
+/// * `data`: the data (`None` if no data loaded).
+pub struct LazyLoader<R, T, K>
+where
+    K: std::fmt::Debug,
+    T: std::fmt::Debug,
+{
+    key: RefCell<Option<K>>,
+    reader: RefCell<R>,
+    loader: LoaderFunc<R, T, K>,
+    data: RefCell<Option<T>>,
+}
+
+impl<R, T, K> std::fmt::Debug for LazyLoader<R, T, K>
+where
+    K: std::fmt::Debug,
+    T: std::fmt::Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LazyLoader")
+            .field("key", &self.key)
+            .field("data", &self.data)
+            .finish_non_exhaustive()
+    }
+}
+
+impl<R, T, K> LazyLoader<R, T, K>
+where
+    K: std::fmt::Debug,
+    T: std::fmt::Debug,
+    K: Clone + PartialEq,
+{
+    pub fn new<F>(reader: R, loader: F) -> LazyLoader<R, T, K>
+    where
+        F: Fn(&mut R, &K) -> Result<T, GRangesError> + 'static,
+    {
+        LazyLoader {
+            key: RefCell::new(None),
+            reader: RefCell::new(reader),
+            loader: Box::new(loader),
+            data: RefCell::new(None),
+        }
+    }
+
+    /// Return a `bool` indicating whether the specified `key` is cached.
+ /// + /// # Arguments + /// * `key`: a `&str` key passed to the loader function. + pub fn is_loaded(&self, key: &K) -> bool { + let loaded_key = self.key.borrow(); + match &*loaded_key { + None => false, + Some(existing_key) => *key == *existing_key, + } + } + + fn load(&self, key: &K) -> Result { + let mut reader = self.reader.borrow_mut(); + let new_data = (self.loader)(&mut reader, key)?; + Ok(new_data) + } + + /// Clear out the cache. + pub fn clear(&self) { + let mut data_borrow = self.data.borrow_mut(); + *data_borrow = None; + *self.key.borrow_mut() = None; + } + + /// Check if the cache is empty. + pub fn is_empty(&self) -> bool { + let data_borrow = self.data.borrow_mut(); + let is_empty = data_borrow.is_none(); + if is_empty { + let key_borrow = self.key.borrow_mut(); + assert_eq!(*key_borrow, None, "invalid state: empty cache, but no key"); + } + is_empty + } + + /// Load the data corresponding to `key` + /// + /// Loads the data associated with a key using the + /// `loader` function. + /// + /// # Arguments + /// * `key`: a `&str` key passed to the loader function. + /// + /// # Returns + /// Returns a [`Result`]. + pub fn get_data(&self, key: &K) -> Result, GRangesError> { + { + let mut data_borrow = self.data.borrow_mut(); + let is_loaded = self.is_loaded(key); + if !is_loaded { + let new_data = self.load(key)?; + *data_borrow = Some(new_data); + *self.key.borrow_mut() = Some(key.clone()); + } + } + + let data_ref = self.data.borrow(); + Ok(Ref::map(data_ref, |opt| { + opt.as_ref().unwrap_or_else(|| { + panic!("Data should be loaded at this point, but it's not available.") + }) + })) + } +} + +pub enum LoadedType { + Sequence(String), + Region(String, i32, i32), +} + +// pub struct LazyIndexedLoader { +// key: RefCell, +// reader: RefCell, +// loader: LoaderFunc, +// query: LoaderFunc, +// data: RefCell>, +// } +// +// impl LazyIndexedLoader { +// pub fn new(reader: R, loader: F, query: P) -> LazyLoader +// where +// F: Fn(&mut R, &str) -> Result + 'static, +// P: Fn(&mut R, &str, i32, i32) -> Result + 'static, +// { +// LazyLoader { +// key: RefCell::new(String::new()), +// reader: RefCell::new(reader), +// loader: Box::new(loader), +// data: RefCell::new(None), +// } +// } +// +// /// Return a `bool` indicating whether the specified `key` is cached. +// /// +// /// # Arguments +// /// * `key`: a `&str` key passed to the loader function. +// pub fn is_loaded(&self, key: &str) -> bool { +// let loaded_key = self.key.borrow(); +// *loaded_key == key +// } +// +// fn load(&self, key: &str) -> Result { +// let mut reader = self.reader.borrow_mut(); +// let new_data = (self.loader)(&mut reader, key)?; +// Ok(new_data) +// } +// +// /// Clear out the cache. +// pub fn clear(&self) { +// let mut data_borrow = self.data.borrow_mut(); +// *data_borrow = None; +// *self.key.borrow_mut() = String::new(); +// } +// +// /// Check if the cache is empty. +// pub fn is_empty(&self) -> bool { +// let data_borrow = self.data.borrow_mut(); +// let is_empty = data_borrow.is_none(); +// if is_empty { +// let key_borrow = self.key.borrow_mut(); +// assert_eq!( +// *key_borrow, +// String::new(), +// "invalid state: empty cache, but no key" +// ); +// } +// is_empty +// } +// +// /// Load the data corresponding to `key` +// /// +// /// Loads the data associated with a key using the +// /// `loader` function. +// /// +// /// # Arguments +// /// * `key`: a `&str` key passed to the loader function. +// /// +// /// # Returns +// /// Returns a `Result`. 
+// pub fn get_data(&self, key: &str) -> Result, GRangesError> { +// { +// let mut data_borrow = self.data.borrow_mut(); +// let is_loaded = self.is_loaded(key); +// if !is_loaded { +// let new_data = self.load(key)?; +// *data_borrow = Some(new_data); +// *self.key.borrow_mut() = key.to_string(); +// } +// } +// +// let data_ref = self.data.borrow(); +// Ok(Ref::map(data_ref, |opt| { +// opt.as_ref().unwrap_or_else(|| { +// panic!("Data should be loaded at this point, but it's not available.") +// }) +// })) +// } +// } diff --git a/src/sequences/mod.rs b/src/sequences/mod.rs new file mode 100644 index 0000000..6bfc124 --- /dev/null +++ b/src/sequences/mod.rs @@ -0,0 +1,29 @@ +//! Functionality for working with per-basepair data. +//! +//! The [`Sequences`] trait defines an abstraction over a per-basepair genomic range data, i.e. +//! when ranges do not need to be explicitly defined because they exhaustively cover the entire +//! genome (even if some values are "missing", e.g. a float NaN). The most common form of this type +//! of data is *nucleotide* sequence data (see [`nucleotide::NucleotideSequences`]) that defines +//! the reference genome nucleotide sequence. But, other type of sequence data exists, e.g. a +//! per-basepair numeric conservation score. +//! +//! ## Main Functionality +//! +//! - Lazy-loading by sequence (e.g. chromosome) with the [`LazyLoader`] type. +//! - If the `--features=ndarray` is set, one and two dimensional per-basepair +//! numeric arrays/matrices *in-memory*, with [`NumericSequences1`] and +//! [`NumericSequences2`], or *lazy loaded* with [`LazyNumericSequences2`] +//! ([``LazyNumericSequences1`] will be added soon, please file a [GitHub +//! issue](https://github.com/vsbuffalo/granges/issues) if you need it). +//! +//! +//! [`Sequences`]: crate::traits::Sequences +//! [`LazyLoader`]: crate::sequences::lazy +//! +//! [`NumericSequences1`]: crate::sequences::numeric::NumericSequences1 +//! [`NumericSequences2`]: crate::sequences::numeric::NumericSequences2 + +pub mod lazy; +pub mod nucleotide; +#[cfg(feature = "ndarray")] +pub mod numeric; diff --git a/src/sequences/nucleotide.rs b/src/sequences/nucleotide.rs new file mode 100644 index 0000000..54d79dd --- /dev/null +++ b/src/sequences/nucleotide.rs @@ -0,0 +1,564 @@ +//! Types and methods for working with per-basepair nucleotide sequence data. +//! +//! Currently this requires the [`noodles::fasta`] module, but their API is unstable +//! and may be a source of future pain. + +use bytes::Bytes; +use genomap::GenomeMap; +use indexmap::IndexMap; +use noodles::core::Region; +use noodles::fasta::indexed_reader; +use noodles::fasta::{io::BufReadSeek, reader, record::Sequence, IndexedReader}; +use std::ops::Deref; +use std::ops::Index; +use std::path::PathBuf; +use std::str; +use std::{cell::Ref, collections::HashSet, fmt}; + +use super::lazy::LazyLoader; +use crate::prelude::GRangesError; +use crate::ranges::try_range; +use crate::traits::Sequences; +use crate::{Position, INTERNAL_ERROR_MESSAGE}; + +/// A newtype around raw nucleotide [`Bytes`], for making it more +/// display and other operations more convenient. 
+#[derive(Clone, Debug, PartialEq)] +pub struct Nucleotides(Bytes); + +impl fmt::Display for Nucleotides { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match str::from_utf8(&self.0) { + Ok(s) => write!(f, "{}", s), + Err(_) => Err(fmt::Error), + } + } +} + +impl Deref for Nucleotides { + type Target = Bytes; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From<&Sequence> for Nucleotides { + fn from(sequence: &Sequence) -> Self { + let seq = Bytes::from(sequence.as_ref().to_vec()); + Nucleotides(seq) + } +} + +impl From for Nucleotides { + fn from(s: String) -> Self { + let bytes = Bytes::from(s.into_bytes()); + Nucleotides(bytes) + } +} + +impl<'a> From<&'a str> for Nucleotides { + fn from(s: &'a str) -> Self { + let bytes = Bytes::from(s.as_bytes().to_vec()); + Nucleotides(bytes) + } +} + +/// A newtype around `Bytes` storage of nucleotide data +impl Nucleotides { + /// Get the length of the nucleotide sequence. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Return whether this is an empty object. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return an [`NucleotideIterator`], which iterates through all + /// bytes in a nucleotide sequence. + pub fn iter(&self) -> NucleotideIterator<'_> { + let inner = self.0.iter(); + NucleotideIterator { inner } + } +} + +/// Iterate over individual nucleotides. +pub struct NucleotideIterator<'a> { + inner: std::slice::Iter<'a, u8>, +} + +impl<'a> Iterator for NucleotideIterator<'a> { + type Item = &'a u8; + + fn next(&mut self) -> Option { + self.inner.next() + } +} + +/// [`NucleotideSequences`] for storing a whole genome's nucleotide sequence +/// data in-memory. +pub struct NucleotideSequences { + data: GenomeMap, +} + +impl NucleotideSequences { + /// Load an entire FASTA file into memory, into a [`NucleotideSequences`] object. + /// + /// # Arguments + /// * `filepath`: a path to the (possible gzipped) FASTA file. + /// * `seqnames`: an optional subset of sequences to load. + pub fn from_fasta( + filepath: impl Into, + seqnames: Option>, + ) -> Result { + let data = parse_fasta(filepath, seqnames)?; + Ok(Self { data }) + } + + /// Retrieve an [`IndexMap`] of the sequence names and their lengths. + pub fn seqlens(&self) -> IndexMap { + self.data + .iter() + .map(|(k, v)| (k.clone(), v.len().try_into().unwrap())) + .collect() + } +} + +/// Retrieve a nucleotide sequence by sequence name. +/// +/// # Returns +/// Returns a reference to a `Nucleotides` object. +impl Index<&str> for NucleotideSequences { + type Output = Nucleotides; + + fn index(&self, index: &str) -> &Self::Output { + self.get_sequence(index).unwrap() + } +} + +impl<'a> Sequences<'a> for NucleotideSequences { + type Container = &'a Nucleotides; + type Slice = &'a [u8]; + + /// Retrieve all sequence names. + fn seqnames(&self) -> Vec { + self.seqlens().keys().cloned().collect() + } + + /// Retrieve the [`Nucleotides`] for a particular sequence name. + fn get_sequence(&'a self, seqname: &str) -> Result { + self.data + .get(seqname) + .ok_or(GRangesError::MissingSequenceName(seqname.to_string())) + } + + /// Apply an arbitrary function to the specified region. + /// + /// # Arguments + /// * `func`: a function that takes [`Bytes`] and processes them, returning a generic type `V`. + /// * `seqname`: the sequence name of the region to apply the function to. + /// * `start`: the start position of the region to apply the function to. + /// * `end`: the end position of the region to apply the function to. 
+ fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: Fn(Self::Slice) -> V, + { + let seq = self.get_sequence(seqname)?; + let range = try_range(start, end, seq.len().try_into().unwrap())?; + let data = &seq[range]; + Ok(func(data)) + } + + /// Get the length of a particular sequence. + fn get_sequence_length(&self, seqname: &str) -> Result { + self.seqlens() + .get(seqname) + .ok_or(GRangesError::MissingSequenceName(seqname.to_string())) + .copied() + } +} + +/// A lazy-loaded set of nucleotide sequences based on indexed FASTA files +#[derive(Debug)] +pub struct LazyNucleotideSequences { + seqlens: IndexMap, + lazy: LazyLoader>, Nucleotides, String>, +} + +impl LazyNucleotideSequences { + /// Create a new `LazyNucleotideSequences`, which can lazyily retrieve regions + /// and whole sequences from an indexed FASTA file. + /// + /// # Arguments + /// * `filepath` - the path to the bgzipped and indexed FASTA file. + /// * `seqnames` - optional vector of sequences to consider (i.e. used for iterators over sequences). + pub fn new( + filepath: impl Into, + seqnames: Option>, + ) -> Result { + let filepath = filepath.into(); + let reader = indexed_reader::Builder::default().build_from_path(filepath)?; + let allowed_seqnames = seqnames.map(|c| c.into_iter().collect::>()); + + let seqlens: IndexMap = reader + .index() + .iter() + .filter_map(|r| { + let name = String::from_utf8(r.name().to_vec()).expect(&format!( + "{}\nError in converting FASTA name to a UTF8 String.", + &INTERNAL_ERROR_MESSAGE + )); + if allowed_seqnames + .as_ref() + .map_or(true, |seqnames| seqnames.contains(&name)) + { + Some((name, r.length().try_into().unwrap())) + } else { + None + } + }) + .collect(); + + let lazy = LazyLoader::new(reader, move |reader, seqname: &String| { + if allowed_seqnames + .as_ref() + .map_or(true, |seqnames| seqnames.contains(seqname)) + { + // NOTE: noodles API changed to use &[u8] for unknown reasons, + // which is rather annoying. + let seqname_u8 = seqname.clone().into_bytes(); + let region = Region::new(seqname_u8, ..); + let record = reader.query(®ion)?; + let seq = record.sequence(); + let nucs: Nucleotides = seq.into(); + Ok(nucs) + } else { + Err(GRangesError::MissingSequenceName(seqname.to_string())) + } + }); + + Ok(Self { seqlens, lazy }) + } + + /// Check of the lazy-loading cache is empty. + pub fn is_empty(&self) -> bool { + self.lazy.is_empty() + } + + /// Clear the lazy-loading cache. + pub fn clear(&self) { + self.lazy.clear() + } + + /// Get an [`IndexMap`] of the sequence names and their lengths. + pub fn seqlens(&self) -> IndexMap { + self.seqlens.clone() + } + + /// Return a `bool` indicating whether the specified `key` is cached. + /// + /// # Arguments + /// * `key`: a `&str` key passed to the loader function. + pub fn is_loaded(&self, key: &str) -> bool { + self.lazy.is_loaded(&key.to_string()) + } +} + +impl<'a> Sequences<'a> for LazyNucleotideSequences { + type Container = Ref<'a, Nucleotides>; + type Slice = &'a [u8]; + + /// Retrieve all sequence names. + fn seqnames(&self) -> Vec { + self.seqlens.keys().cloned().collect() + } + + /// Retrieve the [`Nucleotides`] for a particular sequence name. + fn get_sequence(&'a self, seqname: &str) -> Result { + self.lazy.get_data(&seqname.to_string()) + } + + /// Apply an arbitrary function to the specified region. + /// + /// # Arguments + /// * `func`: a function that takes [`Bytes`] and processes them, returning a generic type `V`. 
+ /// * `seqname`: the sequence name of the region to apply the function to. + /// * `start`: the start position of the region to apply the function to. + /// * `end`: the end position of the region to apply the function to. + fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: for<'b> Fn(&'b [u8]) -> V, + { + let seq = self.get_sequence(seqname)?; + let range = try_range(start, end, seq.len().try_into().unwrap())?; + Ok(func(&seq[range])) + } + + /// Get the length of a particular sequence. + fn get_sequence_length(&self, seqname: &str) -> Result { + self.seqlens + .get(seqname) + .ok_or(GRangesError::MissingSequenceName(seqname.to_string())) + .copied() + } +} + +// Convert an `Option>` into a `Option>` +fn option_vec_to_hashset(x: Option>) -> Option> { + x.map(HashSet::from_iter) +} + +/// Use the [`noodles`] library to parse a FASTA file. +pub fn parse_fasta( + filepath: impl Into, + seqnames: Option>, +) -> Result, GRangesError> { + let seqnames_set = option_vec_to_hashset(seqnames); + + let filepath = filepath.into(); + + let mut reader = reader::Builder.build_from_path(filepath)?; + + let mut sequences = GenomeMap::new(); + + for result in reader.records() { + let record = result?; + let name = String::from_utf8(record.definition().name().to_vec())?; + if seqnames_set + .as_ref() + .map_or(true, |keep_seqnames| keep_seqnames.contains(&name)) + { + let seq = Bytes::from(record.sequence().as_ref().to_vec()); + sequences.insert(&name, Nucleotides(seq))?; + } + } + + Ok(sequences) +} + +// some useful functions + +/// Calculate the GC content of a byte slice, including IUPAC ambiguous codes. +/// +/// # Arguments +/// * `seq` - a byte slice. +pub fn gc_content(seq: &[u8]) -> f64 { + let gc_count = seq + .iter() + .filter(|&&base| { + matches!( + base.to_ascii_uppercase(), + b'G' | b'C' | b'S' | b'V' | b'H' | b'D' | b'B' | b'N' + ) + }) + .count() as f64; + + let total_count = seq.len() as f64; + + if total_count == 0.0 { + 0.0 + } else { + gc_count / total_count + } +} + +/// Calculate the GC content of a byte slice, ignoring IUPAC ambiguous codes. +/// +/// # Arguments +/// * `seq` - a byte slice. 
+pub fn gc_content_strict(seq: &[u8]) -> f64 { + let gc_count = seq + .iter() + .filter(|&&base| matches!(base.to_ascii_uppercase(), b'G' | b'C')) + .count() as f64; + let total_count = seq + .iter() + .filter(|&&base| matches!(base.to_ascii_uppercase(), b'A' | b'T' | b'G' | b'C')) + .count() as f64; + + if total_count == 0.0 { + 0.0 + } else { + gc_count / total_count + } +} + +#[cfg(test)] +mod tests { + use super::{LazyNucleotideSequences, NucleotideSequences}; + use crate::{sequences::nucleotide::Nucleotides, traits::Sequences}; + + #[test] + fn test_nucleotide_sequences() { + let ref_file = "tests_data/sequences/small.fa.gz"; + let reference = + NucleotideSequences::from_fasta(ref_file, None).expect("could not load reference"); + + assert_eq!( + *reference.get_sequence("sequence-1").unwrap(), + Nucleotides::from("ATACGATAACCAGTAACAG") + ); + + assert_eq!(reference.seqnames().len(), 2); + assert_eq!(*reference.seqlens().get("sequence-1").unwrap(), 19); + } + + #[test] + fn test_lazyload() { + let ref_file = "tests_data/sequences/small.fa.gz"; + let reference = + LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); + + assert_eq!(*reference.seqlens.get("sequence-1").unwrap(), 19); + assert_eq!(*reference.seqlens.get("sequence-2").unwrap(), 28); + + let seqlens = reference.seqlens(); + + // Test #1: nothing should be loaded yet + assert_eq!(reference.is_loaded("sequence-1"), false); + assert_eq!(reference.is_loaded("sequence-2"), false); + + // Test #2: validate the range summary by getting whole + // chromosome sequence length through the apply funcs + let seq1_len = seqlens.get("sequence-1").unwrap(); + + fn get_len(seq: &[u8]) -> usize { + seq.len() + } + + let total_len = reference + .region_apply(get_len, "sequence-1", 0, *seq1_len) + .unwrap(); + assert_eq!(total_len, *seq1_len as usize); + + // Test #3: make sure the previous sequence is loaded still + assert_eq!(reference.is_loaded("sequence-1"), true); + + // Test #4: make sure the next sequence is not loaded yet. 
+ assert_eq!(reference.is_loaded("sequence-2"), false); + + // Test #5: load the next sequence + let chr2_len = seqlens.get("sequence-2").unwrap(); + let total_len = reference + .region_apply(get_len, "sequence-2", 0, *chr2_len) + .unwrap(); + assert_eq!(total_len, *chr2_len as usize); + dbg!(&reference); + assert!(reference.is_loaded("sequence-2")); + } + + #[test] + fn lazy_region_apply_slice() { + let ref_file = "tests_data/sequences/small.fa.gz"; + let reference = + LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); + + #[allow(non_snake_case)] + let total_Cs = reference + .region_apply( + |seq| seq.iter().filter(|c| **c == b'C').count(), + "sequence-1", + 3, + 10, + ) + .unwrap(); + assert_eq!(total_Cs, 2); + + #[allow(non_snake_case)] + let total_As = reference + .region_apply( + |seq| seq.iter().filter(|c| **c == b'A').count(), + "sequence-1", + 3, + 10, + ) + .unwrap(); + assert_eq!(total_As, 3); + } + + #[cfg(test)] + #[cfg(feature = "human_tests")] + fn test_lazyload_human() { + use crate::{ + misc::assert_float_eq, + prelude::*, + sequences::nucleotide::{gc_content_strict, LazyNucleotideSequences}, + }; + + let ref_file = "tests_data/human/hg38.analysisSet.fa.bgz"; + let file_exists = std::fs::metadata(ref_file).is_ok(); + + let reference; + if file_exists { + let chroms: Vec = (1..=22).map(|v| format!("chr{}", v)).collect(); + assert_eq!(chroms.len(), 22); + reference = LazyNucleotideSequences::new(ref_file, Some(chroms)) + .expect("could not load reference"); + + // expected hg38 lengths + let expected_numseqs = reference.seqlens().unwrap().len(); + assert_eq!(expected_numseqs, 22); + } else { + println!( + "human reference file '{}' not found, tests skipped.", + ref_file + ); + return; + } + + fn get_len(seq: &[u8]) -> usize { + seq.len() + } + + let seqlens = reference.seqlens().unwrap(); + + // Test #1: nothing should be loaded yet + assert_eq!(reference.is_loaded("chr1"), false); + + // Test #2: validate the range summary by getting whole + // chromosome sequence length through the apply funcs + let chr1_len = seqlens.get("chr1").unwrap(); + // NOTE: the whole chromosome region is len(seq) - 1, since + // it is 0-indexed, and *right-inclusive* + let total_len = reference + .region_apply(get_len, "chr1", 0, *chr1_len - 1) + .unwrap(); + assert_eq!(total_len, *chr1_len as usize); + + // Test #3: make sure the previous sequence is loaded + // still + assert_eq!(reference.is_loaded("chr1"), true); + + // Test #4: make sure the next sequence is not loaded yet. + assert_eq!(reference.is_loaded("chr2"), false); + + // Test #5: load the next sequence + let chr2_len = seqlens.get("chr2").unwrap(); + let total_len = reference + .region_apply(get_len, "chr2", 0, *chr2_len - 1) + .unwrap(); + assert_eq!(total_len, *chr2_len as usize); + + // Test #6: hg specific -- test GC content + // TODO should go in the human test module + let gc = reference + .region_apply(gc_content_strict, "chr1", 0, *chr1_len - 1) + .unwrap(); + let expected_gc = 0.4172; + assert_float_eq(gc, expected_gc, 0.0001); + } +} diff --git a/src/sequences/numeric.rs b/src/sequences/numeric.rs new file mode 100644 index 0000000..d251fea --- /dev/null +++ b/src/sequences/numeric.rs @@ -0,0 +1,382 @@ +//! Per-basepair numeric sequence data. +//! +//! If the crate `ndarray` feature is off, this module will not be loaded, since it requires +//! [ndarray](https://crates.io/crates/ndarray). +//! 
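// Editor's sketch (not part of the diff): minimal in-memory use of the `NumericSequences1`
// container defined below, with the crate's `ndarray` feature enabled. The sequence name and
// per-basepair scores are made up for illustration.
use genomap::GenomeMap;
use granges::sequences::numeric::NumericSequences1;
use granges::traits::Sequences;
use ndarray::Array1;

fn main() {
    // per-basepair scores for a single, hypothetical sequence
    let mut scores = GenomeMap::new();
    scores
        .insert("chr1", Array1::from(vec![0.1, 0.5, 0.25, 0.0, 1.0]))
        .unwrap();

    let seqs = NumericSequences1::new(scores);
    // sum the scores over the 0-indexed, right-exclusive window [1, 4)
    let window_sum: f64 = seqs
        .region_apply(|view| view.sum(), "chr1", 1, 4)
        .unwrap();
    assert!((window_sum - 0.75).abs() < 1e-12);
}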
+use std::{cell::Ref, path::Path}; + +use genomap::GenomeMap; +use indexmap::IndexMap; +#[cfg(feature = "ndarray")] +use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2}; +use ndarray_npy::ReadableElement; +use std::path::PathBuf; + +use super::lazy::LazyLoader; +use crate::error::GRangesError; +use crate::traits::Sequences; +use crate::Position; + +/// A one-dimensional generic numeric per-basepair container. +/// +/// This struct holds one-dimensional sequence data where each basepair position +/// is associated with a numeric value. +/// +/// # Examples +/// +/// ``` +/// use granges::sequences::numeric::NumericSequences1; +/// use granges::test_utilities::random_array1_sequences; +/// +/// let mut data = random_array1_sequences(100); +/// let numeric_seq = NumericSequences1::new(data); +/// ``` +pub struct NumericSequences1 { + data: GenomeMap>, +} + +impl NumericSequences1 { + /// Create a new in-memory per-basepair `Array1` sequence data container. + /// + /// # Arguments + /// + /// * `data` - An [`SequenceMap`] mapping sequence names to `Array1` data. + /// + /// # Examples + /// + /// ``` + /// use ndarray::Array1; + /// use granges::sequences::numeric::NumericSequences1; + /// use granges::test_utilities::random_array1_sequences; + /// + /// let mut data = random_array1_sequences(100); + /// let numeric_seq = NumericSequences1::new(data); + /// ``` + pub fn new(data: GenomeMap>) -> Self { + Self { data } + } +} + +impl<'a, T: 'a> Sequences<'a> for NumericSequences1 { + type Container = &'a Array1; + type Slice = ArrayView1<'a, T>; + + fn seqnames(&self) -> Vec { + self.data.names() + } + + fn get_sequence(&'a self, seqname: &str) -> Result { + let seq = self + .data + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + Ok(seq) + } + + /// Apply a function to `ndarray` views of the `Array1` sequence data at the specified + /// range. + /// + /// # Arguments + fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: for<'b> Fn(Self::Slice) -> V, + { + let seq = self.get_sequence(seqname)?; + let start_usize: usize = start.try_into().unwrap(); + let end_usize: usize = end.try_into().unwrap(); + let view = seq.slice(s![start_usize..end_usize]); + Ok(func(view)) + } + + fn get_sequence_length(&self, seqname: &str) -> Result { + let seq = self.get_sequence(seqname)?; + Ok(seq.len().try_into().unwrap()) + } +} + +/// A two-dimensional generic numeric per-basepair container. +pub struct NumericSequences2 { + data: GenomeMap>, +} + +impl NumericSequences2 { + /// Create a new in-memory per-basepair [`Array2`] sequence data container. + /// + /// # Arguments + /// + /// * `data` - An [`SequenceMap`] mapping sequence names to `Array2` data. + /// + /// # Examples + /// + /// ``` + /// use granges::sequences::numeric::NumericSequences2; + /// use granges::test_utilities::random_array2_sequences; + /// + /// let data = random_array2_sequences(20); + /// let numeric_seq = NumericSequences2::new(data); + /// ``` + /// [`Array2`]: https://docs.rs/ndarray/latest/ndarray/type.Array2.html + pub fn new(data: GenomeMap>) -> Self { + Self { data } + } +} + +impl<'a, T: 'a> Sequences<'a> for NumericSequences2 { + type Container = &'a Array2; + type Slice = ArrayView2<'a, T>; + + /// Retrieves the names of all sequences stored in the container. + /// + /// # Returns + /// + /// A vector of strings, each representing a sequence name. 
+ /// + /// # Examples + /// + /// ``` + /// use crate::granges::traits::Sequences; + /// use granges::sequences::numeric::NumericSequences1; + /// use granges::test_utilities::random_array1_sequences; + /// + /// let data = random_array1_sequences(20); + /// let numeric_seq = NumericSequences1::new(data); + /// let seq_names = numeric_seq.seqnames(); + /// ``` + fn seqnames(&self) -> Vec { + self.data.names() + } + + /// Gets a reference to the sequence data associated with a given sequence name. + /// + /// # Arguments + /// + /// * `seqname` - The name of the sequence to retrieve. + /// + /// # Returns + /// + /// A result containing a reference to the sequence data (`Array1`) on success, + /// or a `GRangesError` on failure. + /// + /// # Examples + /// + /// ``` + /// use granges::sequences::numeric::NumericSequences1; + /// use crate::granges::traits::Sequences; + /// use granges::test_utilities::random_array1_sequences; + /// + /// let mut data = random_array1_sequences(100); + /// let numeric_seq = NumericSequences1::new(data); + /// let sequence = numeric_seq.get_sequence("chr1").unwrap(); + /// ``` + fn get_sequence(&'a self, seqname: &str) -> Result { + let seq = self + .data + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + Ok(seq) + } + + /// Applies a given function to a slice of the sequence data for a specified range. + /// + /// # Arguments + /// + /// * `func` - A closure or function pointer that takes an `ArrayView2` slice of sequence data and returns a value. + /// * `seqname` - The name of the sequence. + /// * `start` - The start position (inclusive) of the range. + /// * `end` - The end position (inclusive) of the range. + /// + /// # Returns + /// + /// A result containing the value returned by the function `func`, or a `GRangesError` on failure. + /// + /// # Examples + /// + /// ``` + /// use crate::granges::traits::Sequences; + /// use granges::sequences::numeric::NumericSequences1; + /// use granges::test_utilities::random_array1_sequences; + /// + /// + /// let mut data = random_array1_sequences(100); + /// let numeric_seq = NumericSequences1::new(data); + /// let result = numeric_seq.region_apply( + /// |view| view.sum(), + /// "chr1", + /// 0, + /// 10 + /// ).unwrap(); + /// ``` + fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: for<'b> Fn(Self::Slice) -> V, + { + let seq = self.get_sequence(seqname)?; + let start_usize: usize = start.try_into().unwrap(); + let end_usize: usize = end.try_into().unwrap(); + let view = seq.slice(s![start_usize..end_usize, ..]); + Ok(func(view)) + } + + /// Retrieves the length of a specific sequence. + /// + /// # Arguments + /// + /// * `seqname` - The name of the sequence. + /// + /// # Returns + /// + /// A result containing the length of the sequence as an [`Position`], or a + /// [`GRangesError`] on failure. 
+ /// + /// # Examples + /// + /// ``` + /// use granges::sequences::numeric::NumericSequences1; + /// use crate::granges::traits::Sequences; + /// use granges::test_utilities::random_array1_sequences; + /// + /// let data = random_array1_sequences(20); + /// let numeric_seq = NumericSequences1::new(data); + /// + /// let seqnames = numeric_seq.seqnames(); + /// let first_seqname = seqnames.first().unwrap(); + /// let length = numeric_seq.get_sequence_length(first_seqname).unwrap(); + /// ``` + fn get_sequence_length(&self, seqname: &str) -> Result { + let seq = self.get_sequence(seqname)?; + Ok(seq.len().try_into().unwrap()) + } +} + +/// A lazy-loaded two-dimensional generic numeric per-basepair container. +pub struct LazyNumericSequences2 +where + T: std::fmt::Debug, +{ + seqlens: IndexMap, + lazy: LazyLoader, Array2, String>, +} + +impl LazyNumericSequences2 +where + T: ReadableElement + std::fmt::Debug, +{ + /// Create a link to an out-of-memory per-basepair [`Array2`] sequence data container. + /// + /// For `.npy` files, use [`ndarray_npy::read_npy`] for the `loader`. + /// + /// # Arguments + /// + /// * `dir` - the directory containing the files. + /// * `pattern` - the file pattern name with a `seqname` wildcard inside `dir`, e.g. + /// "numeric_{}.npy" where `{}` would be a name in `seqlen`. + /// + pub fn new(dir: &str, pattern: &str, loader: F, seqlens: IndexMap) -> Self + where + F: Fn(PathBuf) -> Result, GRangesError> + 'static, + { + let dir_path = Path::new(dir).to_owned(); + let pattern_owned = pattern.to_owned(); + + let lazy = LazyLoader::new(None, move |_reader, seqname: &String| { + let filename = pattern_owned.replace("{}", seqname); + let filepath = dir_path.join(filename); + let data: Array2 = loader(filepath)?; + Ok(data) + }); + + Self { seqlens, lazy } + } +} + +impl<'a, T: 'a> Sequences<'a> for LazyNumericSequences2 +where + T: std::fmt::Debug, +{ + type Container = Ref<'a, Array2>; + type Slice = ArrayView2<'a, T>; + + /// Retrieves the names of all sequences stored in the container. + /// + /// # Returns + /// + /// A vector of strings, each representing a sequence name. + /// + fn seqnames(&self) -> Vec { + self.seqlens.keys().cloned().collect() + } + + /// Gets a reference to the sequence data associated with a given sequence name. + /// + /// # Arguments + /// + /// * `seqname` - The name of the sequence to retrieve. + /// + /// # Returns + /// + /// A result containing a reference to the sequence data (`Array1`) on success, + /// or a `GRangesError` on failure. + /// + fn get_sequence(&'a self, seqname: &str) -> Result { + self.lazy.get_data(&seqname.to_string()) + } + + /// Applies a given function to a slice of the sequence data for a specified range. + /// + /// # Arguments + /// + /// * `func` - A closure or function pointer that takes an `ArrayView2` slice of sequence data and returns a value. + /// * `seqname` - The name of the sequence. + /// * `start` - The start position (inclusive) of the range. + /// * `end` - The end position (inclusive) of the range. + /// + /// # Returns + /// + /// A result containing the value returned by the function `func`, or a `GRangesError` on failure. 
+ /// + fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: for<'b> Fn(ArrayView2<'b, T>) -> V, + { + let seq = self.get_sequence(seqname)?; + let start_usize: usize = start.try_into().unwrap(); + let end_usize: usize = end.try_into().unwrap(); + let view = seq.slice(s![start_usize..end_usize, ..]); + let value = func(view); + Ok(value) + } + + /// Retrieves the length of a specific sequence. + /// + /// # Arguments + /// + /// * `seqname` - The name of the sequence. + /// + /// # Returns + fn get_sequence_length(&self, seqname: &str) -> Result { + let seq = self.get_sequence(seqname)?; + let seqlen = seq.len().try_into().unwrap(); + Ok(seqlen) + } +} + +#[cfg(test)] +mod tests {} diff --git a/src/test_utilities.rs b/src/test_utilities.rs index ac48ae3..b9b6572 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -16,7 +16,11 @@ use crate::{ }, Position, }; +#[cfg(feature = "ndarray")] +use genomap::GenomeMap; use indexmap::IndexMap; +#[cfg(feature = "ndarray")] +use ndarray::{s, Array1, Array2}; use rand::{seq::SliceRandom, thread_rng, Rng}; use tempfile::{Builder, NamedTempFile}; @@ -122,7 +126,7 @@ pub fn temp_bedfile() -> NamedTempFile { } /// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. -pub fn random_bedfile(length: usize) -> NamedTempFile { +pub fn random_bed3file(length: usize) -> NamedTempFile { let temp_bedfile = temp_bedfile(); granges_random_bed( "tests_data/hg38_seqlens.tsv", @@ -171,3 +175,44 @@ pub fn granges_test_case_02() -> GRanges> { "chr2" => [(100, 200, 3.7), (250, 300, 1.1)] }, seqlens: { "chr1" => 50, "chr2" => 300 }) } + +#[cfg(feature = "ndarray")] +/// Mock numeric `NumericSequences1` data +pub fn random_array1_sequences(n: usize) -> GenomeMap> { + let mut seqs = GenomeMap::new(); + // length of sequence + let max_len = 100_000; + for seqnum in 1..=n { + let chrom = format!("chr{}", seqnum); + let array: Array1 = Array1::from_iter(1..=max_len).map(|&x| x as f64); + seqs.insert(&chrom, array).unwrap(); + } + seqs +} + +#[cfg(feature = "ndarray")] +/// Mock numeric `NumericSequences2` data +pub fn random_array2_sequences(n: usize) -> GenomeMap> { + let mut seqs = GenomeMap::new(); + // length of sequence + let max_len = 100_000; + for seqnum in 1..=n { + let chrom = format!("chr{}", seqnum); + + // Create a 2-column Array2 with max_len rows + let rows = max_len as usize; + let mut array = Array2::::zeros((rows, 2)); + + // Fill the array with values from 0 to 2 * max_len + let mut value = 0.0; + for row in array.rows_mut() { + for elem in row { + *elem = value; + value += 1.0; + } + } + + seqs.insert(&chrom, array).unwrap(); + } + seqs +} diff --git a/src/traits.rs b/src/traits.rs index 1966830..a60a474 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -3,6 +3,8 @@ use std::path::PathBuf; +use indexmap::IndexMap; + use crate::{ error::GRangesError, granges::GRanges, @@ -175,15 +177,19 @@ pub trait IntoIterableRangesContainer { /// protect against this edge case would force most `GRanges` /// functions to return a `Result`. This would clog the API usage, so /// we opt to just panic. 
-pub trait IndexedDataContainer<'a>: DataContainer { +pub trait IndexedDataContainer: DataContainer { // note: I don't think we can refactor out this lifetime, // since it is needed often for associated types - type Item; + type Item<'a> + where + Self: 'a; + type OwnedItem; type Output; // this type is needed to reference the core underlying type, // eg to handle reference types fn is_valid_index(&self, index: usize) -> bool; - fn get_value(&'a self, index: usize) -> ::Item; + fn get_value(&self, index: usize) -> ::Item<'_>; + fn get_owned(&self, index: usize) -> ::OwnedItem; fn len(&self) -> usize; fn is_empty(&self) -> bool { self.len() == 0 @@ -202,6 +208,44 @@ pub trait IndexedDataContainer<'a>: DataContainer { fn new_from_indices(&self, indices: &[usize]) -> Self::Output; } +/// The Sequences trait defines generic functionality for +/// per-basepair data, e.g. nucleotide sequences or some +/// per-basepair numeric score. +pub trait Sequences<'a> { + type Container: 'a; + type Slice; + + fn seqnames(&self) -> Vec; + fn get_sequence(&'a self, seqname: &str) -> Result; + fn get_sequence_length(&self, seqname: &str) -> Result; + + /// Evaluate a function on a [`Sequences::Slice`] of a sequence. + /// + /// # Arguments + /// * `func` - a function that takes a `Self::Slice` and returns a [`Result`] + /// * `seqname` - sequence name. + /// * `start` - a [`Position`] start position. + /// * `end` - a [`Position`] *inclusive* end position. + fn region_apply( + &'a self, + func: F, + seqname: &str, + start: Position, + end: Position, + ) -> Result + where + F: Fn(::Slice) -> V; + + fn seqlens(&self) -> Result, GRangesError> { + let mut seqlens = IndexMap::new(); + for seqname in self.seqnames() { + let seqlen = self.get_sequence_length(&seqname)?; + seqlens.insert(seqname, seqlen); + } + Ok(seqlens) + } +} + /// Defines how to serialize something to TSV. pub trait TsvSerialize { // Serialize something to a TSV [`String`]. 
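// Editor's sketch (not part of the diff): using the new `Sequences` trait above with the
// in-memory `NucleotideSequences` reader and the `gc_content()` helper from
// src/sequences/nucleotide.rs, on the small test FASTA added by this PR (assumes the
// working directory is the repository root).
use granges::prelude::*;
use granges::sequences::nucleotide::{gc_content, NucleotideSequences};
use granges::traits::Sequences;

fn main() -> Result<(), GRangesError> {
    let seqs = NucleotideSequences::from_fasta("tests_data/sequences/small.fa.gz", None)?;
    // GC fraction over a 0-indexed, right-exclusive window of sequence-1; because
    // `region_apply()` is a trait method, the same call works for `LazyNucleotideSequences`.
    let gc = seqs.region_apply(gc_content, "sequence-1", 0, 10)?;
    println!("GC content of sequence-1[0, 10): {gc:.3}");
    Ok(())
}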
diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 7d204d2..8a8228e 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -3,7 +3,7 @@ use granges::{ commands::granges_random_bed, prelude::GenomicRangesFile, - test_utilities::{granges_binary_path, random_bedfile, temp_bedfile}, + test_utilities::{granges_binary_path, random_bed3file, temp_bedfile}, }; use std::process::Command; @@ -78,8 +78,8 @@ fn test_against_bedtools_slop() { fn test_against_bedtools_intersect_wa() { let num_ranges = 1_000_000; - let random_bedfile_left_tempfile = random_bedfile(num_ranges); - let random_bedfile_right_tempfile = random_bedfile(num_ranges); + let random_bedfile_left_tempfile = random_bed3file(num_ranges); + let random_bedfile_right_tempfile = random_bed3file(num_ranges); let random_bedfile_left = random_bedfile_left_tempfile.path(); let random_bedfile_right = random_bedfile_right_tempfile.path(); @@ -133,7 +133,7 @@ fn test_against_bedtools_intersect_wa() { fn test_against_bedtools_flank() { let num_ranges = 1_000; - let random_bedfile_tempfile = random_bedfile(num_ranges); + let random_bedfile_tempfile = random_bed3file(num_ranges); let random_bedfile = random_bedfile_tempfile.path(); granges_random_bed( diff --git a/tests_data/bedtools/map_a.txt b/tests_data/bedtools/map_a.txt new file mode 100644 index 0000000..c5a6e26 --- /dev/null +++ b/tests_data/bedtools/map_a.txt @@ -0,0 +1,3 @@ +chr1 10 20 a1 1 + +chr1 50 60 a2 2 - +chr1 80 90 a3 3 - diff --git a/tests_data/bedtools/map_b.txt b/tests_data/bedtools/map_b.txt new file mode 100644 index 0000000..71a6f41 --- /dev/null +++ b/tests_data/bedtools/map_b.txt @@ -0,0 +1,5 @@ +chr1 12 14 b1 2 + +chr1 13 15 b2 5 - +chr1 16 18 b3 5 + +chr1 82 85 b4 2 - +chr1 85 87 b5 3 + diff --git a/tests_data/sequences/small.fa.gz b/tests_data/sequences/small.fa.gz new file mode 100644 index 0000000..7424654 Binary files /dev/null and b/tests_data/sequences/small.fa.gz differ diff --git a/tests_data/sequences/small.fa.gz.fai b/tests_data/sequences/small.fa.gz.fai new file mode 100644 index 0000000..38b485d --- /dev/null +++ b/tests_data/sequences/small.fa.gz.fai @@ -0,0 +1,2 @@ +sequence-1 19 12 19 20 +sequence-2 28 44 28 29 diff --git a/tests_data/sequences/small.fa.gz.gzi b/tests_data/sequences/small.fa.gz.gzi new file mode 100644 index 0000000..1b1cb4d Binary files /dev/null and b/tests_data/sequences/small.fa.gz.gzi differ
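// Editor's sketch (not part of the diff): lazily loading per-chromosome `.npy` matrices with
// `LazyNumericSequences2` (requires the `ndarray` feature). The directory, file pattern, and
// chromosome set below are hypothetical; errors from ndarray-npy are surfaced as panics to
// keep the sketch small.
use granges::sequences::numeric::LazyNumericSequences2;
use granges::traits::Sequences;
use granges::Position;
use indexmap::IndexMap;
use ndarray::{Array2, Axis};

fn main() {
    let mut seqlens: IndexMap<String, Position> = IndexMap::new();
    seqlens.insert("chr1".to_string(), 248_956_422);

    // Expects one file per chromosome, e.g. per_bp_scores/scores_chr1.npy.
    let scores = LazyNumericSequences2::new(
        "per_bp_scores",
        "scores_{}.npy",
        |path| {
            let arr: Array2<f64> = ndarray_npy::read_npy(path).expect("could not read .npy file");
            Ok(arr)
        },
        seqlens,
    );

    // Column-wise means over a 0-indexed, right-exclusive window; chr1 is loaded on first use.
    let means = scores
        .region_apply(|view| view.mean_axis(Axis(0)), "chr1", 1_000, 2_000)
        .unwrap();
    println!("{:?}", means);
}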