diff --git a/Cargo.toml b/Cargo.toml index 4091fa3..1f14e55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "granges" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" authors = ["Vince Buffalo "] @@ -31,9 +31,8 @@ csv = "1.3.0" serde = { version = "1.0.197", features = ["derive"] } [features] -# cli = [ "clap" ] # TODO make feature dev-commands = [ ] -test_bigranges = [] # NOTE: not used yet, for tests on large files +bench-big = [] polars = ["dep:polars"] ndarray = ["dep:ndarray", "dep:ndarray-npy"] big-position = [] diff --git a/README.md b/README.md index 422cb69..e9bdf11 100644 --- a/README.md +++ b/README.md @@ -110,23 +110,3 @@ windows 418.87 s 65.13 s 84.4515 map_median 112.35 s 82.81 s 26.2995 ``` -Note however, there is a arithmetic scaling issue. Larger range datasets -(1,000,000) lead to map operations that are inefficient. This is almost surely -due to `data/operations.rs`. Here is an example benchmark: - -``` -command bedtools time granges time granges speedup (%) ------------- --------------- -------------- --------------------- -map_multiple 26.89 min 688.60 s 57.3189 -map_max 21.34 min 21.54 min -0.94889 * -adjust 21.35 min 499.54 s 60.9959 -filter 27.01 min 20.38 min 24.5583 -map_min 935.69 s 17.48 min -12.1008 * -flank 30.26 min 943.36 s 48.0353 -map_mean 20.13 min 931.21 s 22.913 -map_sum 17.99 min 18.44 min -2.5225 * -windows 507.84 s 84.79 s 83.304 -map_median 16.90 min 17.81 min -5.36811 * -``` - -So median, sum, min, and max are slower. diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index b518d47..723a3d0 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -13,7 +13,10 @@ use std::{ process::{Command, Stdio}, }; +#[cfg(not(feature = "bench-big"))] const BED_LENGTH: usize = 100_000; +#[cfg(feature = "bench-big")] +const BED_LENGTH: usize = 1_000_000; fn bench_range_adjustment(c: &mut Criterion) { // create the benchmark group @@ -229,7 +232,7 @@ fn bench_map(c: &mut Criterion) { let mut group = c.benchmark_group(format!("map_{}", operation).to_string()); // configure the sample size for the group - group.sample_size(10); + group.sample_size(100); group.bench_function("bedtools_map", |b| { b.iter(|| { let bedtools_output_file = File::create(&bedtools_path).unwrap(); diff --git a/examples/mean_score.rs b/examples/mean_score.rs index 2891caf..14c5327 100644 --- a/examples/mean_score.rs +++ b/examples/mean_score.rs @@ -35,11 +35,9 @@ fn try_main() -> Result<(), granges::error::GRangesError> { .map_data(|bed5_cols| bed5_cols.score)?; // Compute overlaps and combine scores into mean. - let results_gr = left_gr - .left_overlaps(&right_gr)? - .map_joins(mean_score)?; + let results_gr = left_gr.left_overlaps(&right_gr)?.map_joins(mean_score)?; - results_gr.to_tsv(None::, &BED_TSV)?; + results_gr.write_to_tsv(None::, &BED_TSV)?; Ok(()) } diff --git a/src/commands.rs b/src/commands.rs index e01ba7b..dd326cb 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -105,11 +105,13 @@ pub fn granges_adjust( match ranges_iter { GenomicRangesParser::Bed3(iter) => { let gr = GRangesEmpty::from_iter(iter, &genome)?; - gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? + gr.adjust_ranges(-both, both) + .write_to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bed5(iter) => { let gr = GRanges::from_iter(iter, &genome)?; - gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? + gr.adjust_ranges(-both, both) + .write_to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bedlike(iter) => { // Note the call to try_unwrap_data() here: this is because @@ -117,7 +119,8 @@ pub fn granges_adjust( // values means that writing to TSV doesn't have to deal with this (which // always creates headaches). let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?; - gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? + gr.adjust_ranges(-both, both) + .write_to_tsv(output, &BED_TSV)? } GenomicRangesParser::Unsupported => { return Err(GRangesError::UnsupportedGenomicRangesFileFormat) @@ -181,7 +184,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output, &BED_TSV)?; + semijoin.write_to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -203,7 +206,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output, &BED_TSV)?; + semijoin.write_to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -223,7 +226,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output, &BED_TSV)?; + semijoin.write_to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -246,7 +249,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let intersection = left_gr.filter_overlaps(&right_gr)?; - intersection.to_tsv(output, &BED_TSV)?; + intersection.write_to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -303,7 +306,8 @@ pub fn granges_flank( } else { GRangesEmpty::from_iter(iter, &genome)? }; - gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)? + gr.flanking_ranges(left, right)? + .write_to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bed5(_iter) => { unimplemented!() @@ -314,7 +318,8 @@ pub fn granges_flank( } else { GRanges::from_iter(iter.try_unwrap_data(), &genome)? }; - gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)? + gr.flanking_ranges(left, right)? + .write_to_tsv(output, &BED_TSV)? } GenomicRangesParser::Unsupported => { return Err(GRangesError::UnsupportedGenomicRangesFileFormat) @@ -469,7 +474,7 @@ pub fn granges_map( .collect::>() })?; - result_gr.to_tsv(output, &BED_TSV)?; + result_gr.write_to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -483,7 +488,7 @@ pub fn granges_windows( output: Option>, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; - GRangesEmpty::from_windows(&genome, width, step, chop)?.to_tsv(output, &BED_TSV)?; + GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?; let report = Report::new(); Ok(CommandOutput::new((), report)) } @@ -504,13 +509,13 @@ pub fn granges_random_bed( if sort { gr = gr.sort() } - gr.to_tsv(output, &BED_TSV)?; + gr.write_to_tsv(output, &BED_TSV)?; } else { let mut gr = random_granges(&genome, num)?; if sort { gr = gr.sort(); } - gr.to_tsv(output, &BED_TSV)?; + gr.write_to_tsv(output, &BED_TSV)?; }; let report = Report::new(); diff --git a/src/data/mod.rs b/src/data/mod.rs index 532428a..15bbd0f 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -3,9 +3,14 @@ use crate::traits::{DataContainer, IntoDatumType}; +#[cfg(feature = "ndarray")] +pub mod ndarray; pub mod operations; pub mod vec; +impl DataContainer for Vec {} +impl DataContainer for () {} + /// These are core supported data types stored in an `enum`, to /// unify the types that come out of standard operations like /// `select()`. @@ -67,6 +72,3 @@ impl From for DatumType { item.into_data_type() } } - -impl DataContainer for Vec {} -impl DataContainer for () {} diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs index 6aab541..97799ce 100644 --- a/src/data/ndarray.rs +++ b/src/data/ndarray.rs @@ -1,21 +1,159 @@ //! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`]. -use ndarray::{Array1, Array2, ArrayView1}; -use crate::traits::IndexedDataContainer; +use crate::error::GRangesError; +use crate::granges::GRanges; +use crate::io::TsvConfig; +use crate::traits::{DataContainer, IndexedDataContainer, RangeContainer, TsvSerialize}; +use ndarray::{Array1, Array2, ArrayBase, ArrayView1, Dim, OwnedRepr}; -impl<'a, U> IndexedDataContainer<'a> for Array1 +impl DataContainer for Array1 {} +impl DataContainer for Array2 {} + +impl GRanges where - U: Copy + Default + 'a, + C: RangeContainer, { - type Item = U; + /// Take the container data into an iterator, apply a function, a put the results in an + /// [`Array1`]. + /// + /// This is a convenience function for `GRanges::map_data().into_array1()`. + /// + /// [`Array1`]: ndarray::Array1 + pub fn map_into_array1(mut self, func: F) -> Result>, GRangesError> + where + F: FnMut(::Item) -> U, + T: IntoIterator, + { + let total_len = self.len(); + let (ranges, data) = self.take_both()?; + + let transformed_data = data.into_iter().map(func).collect::>(); + + assert_eq!( + transformed_data.len(), + total_len, + "Transformed data has different length than GRanges!" + ); + + Ok(GRanges { + ranges, + data: Some(transformed_data), + }) + } + + /// Take the container data into an iterator, apply a function, a put the results in an + /// [`Array2`]. + /// + /// This is a convenience function for `GRanges::map_data().into_array2()`. + /// + /// [`Array2`]: ndarray::Array2 + pub fn map_into_array2( + mut self, + ncols: usize, + func: F, + ) -> Result>, GRangesError> + where + F: FnMut(::Item) -> Vec, + T: IntoIterator, + { + let total_len = self.len(); + let (ranges, data) = self.take_both()?; + + let flat_vec: Vec = data.into_iter().flat_map(func).collect(); + let transformed_data = Array2::from_shape_vec((total_len, ncols), flat_vec)?; + + assert_eq!( + transformed_data.shape()[0], + total_len, + "Transformed data has different length than GRanges!" + ); + + Ok(GRanges { + ranges, + data: Some(transformed_data), + }) + } + + /// Use [`IntoIterator`] to convert the data container into a new [`Array1`] data container. + /// + /// This is like [`GRanges::map_into_array1`] without applying a function to each element. + /// + /// [`Array1`]: ndarray::Array1 + pub fn into_array1( + mut self, + ) -> Result::Item>>, GRangesError> + where + T: IntoIterator, + { + let total_len = self.len(); + let (ranges, data) = self.take_both()?; + + let transformed_data = data + .into_iter() + .collect::::Item>>(); + + assert_eq!( + transformed_data.len(), + total_len, + "Transformed data has different length than GRanges!" + ); + + Ok(GRanges { + ranges, + data: Some(transformed_data), + }) + } + + /// Use [`IntoIterator`] to convert the data container into a new [`Array2`] data container. + /// + /// This is like [`GRanges::map_into_array2`] without applying a function to each element. + /// + /// [`Array2`]: ndarray::Array2 + pub fn into_array2( + mut self, + ncols: usize, + ) -> Result::Item as IntoIterator>::Item>>, GRangesError> + where + C: RangeContainer, + T: IntoIterator, + T::Item: IntoIterator, + <::Item as IntoIterator>::Item: Copy + 'static, + { + let total_len = self.len(); + let (ranges, data) = self.take_both()?; + + let flat_vec: Vec<_> = data + .into_iter() + .flat_map(|sub_iter| sub_iter.into_iter()) + .collect(); + let transformed_data = Array2::from_shape_vec((total_len, ncols), flat_vec)?; + + assert_eq!( + transformed_data.shape()[0], + total_len, + "Transformed data has different length than GRanges!" + ); + + Ok(GRanges { + ranges, + data: Some(transformed_data), + }) + } +} + +impl IndexedDataContainer for Array1 +where + U: Copy + Default + 'static, +{ + type Item<'a> = U; type OwnedItem = U; type Output = Array1; - fn get_value(&'a self, index: usize) -> Self::Item { + fn get_value(&self, index: usize) -> Self::Item<'_> { self[index] } - fn get_owned(&'a self, index: usize) -> ::OwnedItem { + fn get_owned(&self, index: usize) -> ::OwnedItem { // already owned self[index] } @@ -33,19 +171,19 @@ where } } -impl<'a, U> IndexedDataContainer<'a> for Array2 +impl IndexedDataContainer for Array2 where - U: Copy + Default + 'a, + U: Copy + Default + 'static, { - type Item = ArrayView1<'a, U>; + type Item<'a> = ArrayView1<'a, U>; type OwnedItem = Array1; type Output = Array2; - fn get_value(&'a self, index: usize) -> Self::Item { + fn get_value(&self, index: usize) -> Self::Item<'_> { self.row(index) } - fn get_owned(&'a self, index: usize) -> ::OwnedItem { + fn get_owned(&self, index: usize) -> ::OwnedItem { self.row(index).to_owned() } @@ -71,3 +209,62 @@ where .expect("Shape and collected data size mismatch") } } + +impl TsvSerialize for ArrayBase, Dim<[usize; 1]>> { + fn to_tsv(&self, config: &TsvConfig) -> String { + self.iter() + .map(|x| x.to_tsv(config)) + .collect::>() + .join("\t") + } +} + +#[cfg(test)] +mod tests { + use crate::test_utilities::granges_test_case_01; + + #[test] + fn test_map_into_array1() { + let gr = granges_test_case_01(); + let new_gr = gr.map_into_array1(|x| x + 1.0).unwrap(); + // note: 5.0, because 5 ranges each 1 added to + assert!((new_gr.data().unwrap().sum() - (24.1 + 5.0)).abs() < 1e-4); + } + + #[test] + fn test_map_into_array2() { + let gr = granges_test_case_01(); + let new_gr = gr.map_into_array2(2, |x| vec![x + 1.0, 1.0]).unwrap(); + // note: 5.0, because 5 ranges each 1 added to + let array2 = new_gr.data().unwrap(); + let first_col_sum = array2.sum_axis(ndarray::Axis(0))[0]; + let second_col_sum = array2.sum_axis(ndarray::Axis(0))[1]; + assert!((first_col_sum - (24.1 + 5.0)).abs() < 1e-4); + assert!((second_col_sum - (5.0)).abs() < 1e-4); + } + + #[test] + fn test_into_array1() { + let gr = granges_test_case_01(); + let new_gr = gr.into_array1().unwrap(); + assert!((new_gr.data().unwrap().sum() - 24.1).abs() < 1e-4); + } + + #[test] + fn test_into_array2() { + // test case: just wipe out the data and replace with (1, 2) + // rows; check sum + let gr = granges_test_case_01(); + let new_gr = gr + .map_data(|_| vec![1.0, 2.0]) + .unwrap() + .into_array2(2) + .unwrap(); + + let array2 = new_gr.data().unwrap(); + let first_col_sum = array2.sum_axis(ndarray::Axis(0))[0]; + let second_col_sum = array2.sum_axis(ndarray::Axis(0))[1]; + assert!(((first_col_sum as f64) - 5.0).abs() < 1e-4); + assert!(((second_col_sum as f64) - 10.0).abs() < 1e-4); + } +} diff --git a/src/error.rs b/src/error.rs index 2884a88..3204519 100644 --- a/src/error.rs +++ b/src/error.rs @@ -137,6 +137,9 @@ pub enum GRangesError { #[error("The GRanges object is invalid because it lacks an associated data container. Ensure data is loaded or associated with the GRanges object before attempting operations that require data.")] NoDataContainer, + #[error("The supplied GRanges object and data container cannot be united into a new GRanges since they have differing lengths.")] + IncompatableGRangesAndData, + // Sequences related errors #[error("The sequence name '{0}' was not found in the dataset. Verify the sequence names and try again.")] MissingSequenceName(String), @@ -159,4 +162,9 @@ pub enum GRangesError { #[error("No operation was specified. See granges map --help.")] NoOperationSpecified, + + // ndarray related errors + #[cfg(feature = "ndarray")] + #[error("Invalid shape encountered by ndarray: {0}")] + InvalidNdarrayShape(#[from] ndarray::ShapeError), } diff --git a/src/granges.rs b/src/granges.rs index 4bf73ee..394eac8 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -74,109 +74,141 @@ pub struct GRanges { #[derive(Clone, Debug)] pub struct GRangesEmpty(GRanges); -impl GRangesEmpty +impl GRanges where C: RangeContainer, { /// Get the total number of ranges. pub fn len(&self) -> usize { - self.0.ranges.values().map(|ranges| ranges.len()).sum() + self.ranges.values().map(|ranges| ranges.len()).sum() } /// Return whether the [`GRanges`] object is empty (contains no ranges). pub fn is_empty(&self) -> bool { - self.0.len() == 0 + self.len() == 0 } /// Get the raw range container. pub fn get_ranges(&self, seqname: &str) -> Option<&C> { - self.0.ranges.get(seqname) + self.ranges.get(seqname) } /// Get the sequence names. pub fn seqnames(&self) -> Vec { - self.0.ranges.names() + self.ranges.names() } /// Get the sequences lengths. pub fn seqlens(&self) -> IndexMap { let seqlens = self - .0 .ranges .iter() .map(|(seqname, ranges)| (seqname.to_string(), ranges.sequence_length())) .collect(); seqlens } -} -impl From> for GRanges { - fn from(value: GRangesEmpty) -> Self { - value.0 + /// Get a reference to the data container. + pub fn data(&self) -> Option<&T> { + self.data.as_ref() } -} -impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { - /// Convert a reference to a [`GRangesEmpty`] to a reference to the - /// underlying [`GRanges`]. This is to greatly improve the ergonomics - /// of functions that could take either a [`GRanges`] or [`GRangesEmpty] type. - fn as_granges_ref(&'a self) -> &'a GRanges { - &self.0 + /// Take the data out of this [`GRanges`] object. + // NOTE: I have considered removing the Result here. + // + // This refactor does not seem worth it. There are good reasons in related + // functions to keep [`Result`] there, e.g. range pushing methods + // could hit invalid ranges or chromosome names (user-time, non-dev issue). + // Even though this is unlikely to raise an error in proper use, there is + // no real benefit to removing the non-happy path here. + pub fn take_data(&mut self) -> Result { + std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer) } -} -impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { - /// Return a reference of a [`GRanges`] object. This is essentially - /// a pass-through method. [`AsGRangesRef`] is not needed in this case, - /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to - /// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types. - fn as_granges_ref(&'a self) -> &'a GRanges { - self + /// Take the ranges out of this [`GRanges`] object. + pub fn take_ranges(&mut self) -> GenomeMap { + std::mem::take(&mut self.ranges) + } + + pub fn take_both(&mut self) -> Result<(GenomeMap, T), GRangesError> { + let data = std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer)?; + let ranges = std::mem::take(&mut self.ranges); + Ok((ranges, data)) } } impl GRanges +where + C: RangeContainer + Clone, +{ + /// Create a new [`GRanges`] object by cloning the ranges of this one, + /// and associating the supplied data with it (this consumes the data). + pub fn clone_with_data(&self, data: Option) -> GRanges { + GRanges { + ranges: self.ranges.clone(), + data, + } + } +} + +impl GRangesEmpty where C: RangeContainer, { /// Get the total number of ranges. pub fn len(&self) -> usize { - self.ranges.values().map(|ranges| ranges.len()).sum() + self.0.ranges.values().map(|ranges| ranges.len()).sum() } /// Return whether the [`GRanges`] object is empty (contains no ranges). pub fn is_empty(&self) -> bool { - self.len() == 0 + self.0.len() == 0 } /// Get the raw range container. pub fn get_ranges(&self, seqname: &str) -> Option<&C> { - self.ranges.get(seqname) + self.0.ranges.get(seqname) } /// Get the sequence names. pub fn seqnames(&self) -> Vec { - self.ranges.names() + self.0.ranges.names() } /// Get the sequences lengths. pub fn seqlens(&self) -> IndexMap { let seqlens = self + .0 .ranges .iter() .map(|(seqname, ranges)| (seqname.to_string(), ranges.sequence_length())) .collect(); seqlens } +} - /// Take the data out of this [`GRanges`] object. - // NODE/TODO?: this is used a lot -- using .expect() - // reduce Results a lot. - pub fn take_data(&mut self) -> Result { - std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer) +impl From> for GRanges { + fn from(value: GRangesEmpty) -> Self { + value.0 } - pub fn take_ranges(&mut self) -> GenomeMap { - std::mem::take(&mut self.ranges) +} + +impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { + /// Convert a reference to a [`GRangesEmpty`] to a reference to the + /// underlying [`GRanges`]. This is to greatly improve the ergonomics + /// of functions that could take either a [`GRanges`] or [`GRangesEmpty] type. + fn as_granges_ref(&'a self) -> &'a GRanges { + &self.0 + } +} + +impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { + /// Return a reference of a [`GRanges`] object. This is essentially + /// a pass-through method. [`AsGRangesRef`] is not needed in this case, + /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to + /// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types. + fn as_granges_ref(&'a self) -> &'a GRanges { + self } } @@ -193,7 +225,7 @@ where /// * `output`: either `None` (for standard out) or file path. If the filepath /// ends in `.gz`, the output will be gzip-compressed. /// * `config`: a [`TsvConfig`], which contains the TSV output settings. - fn to_tsv( + fn write_to_tsv( &'a self, output: Option>, config: &TsvConfig, @@ -222,7 +254,7 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges /// # Arguments /// * `output`: either `None` (for standard out) or file path. /// * `config`: a [`TsvConfig`], which contains the TSV output settings. - fn to_tsv( + fn write_to_tsv( &'a self, output: Option>, config: &TsvConfig, @@ -286,6 +318,117 @@ impl GRanges, T> { } } +impl GRanges +where + C: IterableRangeContainer, +{ + /// Consume the current [`GRanges`] into a new [`GRanges`]. + /// + /// This will change all ranges to [`RangeEmpty`], since + /// indices are now invalid. + /// + /// Note the converse operation, going from a [`GRangesEmpty`] + /// to a [`GRanges`] can be done with `GRangesEmpty::into()`. + pub fn into_granges_empty(self) -> Result, GRangesError> { + let seqlens = self.seqlens(); + let mut ranges = GenomeMap::new(); + for (seqname, ranges_indexed) in self.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. + let seqlen = seqlens.get(seqname).unwrap(); + let mut ranges_empty = VecRangesEmpty::new(*seqlen); + for range in ranges_indexed.iter_ranges() { + ranges_empty.push_range(range.into()); + } + ranges.insert(seqname, ranges_empty)?; + } + Ok(GRangesEmpty(GRanges { ranges, data: None })) + } +} + +impl GRanges +where + ::RangeType: GenericRange, +{ + /// Retrieve all midpoints. + /// + /// These are calculated as (start + end)/2 in [`Position`], + /// which will truncate them. + pub fn midpoints(&self) -> Result>, GRangesError> { + let mut all_midpoints = GenomeMap::new(); + for (seqname, ranges) in self.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. + let mut midpoints = Vec::new(); + for range in ranges.iter_ranges() { + midpoints.push(range.midpoint()); + } + all_midpoints.insert(seqname, midpoints)?; + } + Ok(all_midpoints) + } +} + +impl GRanges +where + C: IterableRangeContainer, +{ + /// Get the indices of all ranges, putting them in a new + /// [`GenomeMap>`]. + pub fn data_indices(&self) -> Result>, GRangesError> { + let mut all_indices = GenomeMap::new(); + for (seqname, ranges) in self.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. + let mut indices = Vec::new(); + for range in ranges.iter_ranges() { + indices.push(range.index); + } + all_indices.insert(seqname, indices)?; + } + Ok(all_indices) + } +} + +impl GRanges> +where + C: IterableRangeContainer, +{ + /// Create a new [`GenomeMap>`] of clones of the data in the + /// data container, grouped by chromosome. + pub fn data_by_seqname(&self) -> Result>, GRangesError> { + let data = self.data().ok_or(GRangesError::NoDataContainer)?; + let mut all_new_data = GenomeMap::new(); + for (seqname, indices) in self.data_indices()? { + let mut new_data = Vec::new(); + for index in indices { + let value = data.get(index).unwrap(); + new_data.push((*value).clone()); + } + all_new_data.insert(&seqname, new_data)?; + } + Ok(all_new_data) + } +} + +impl GRanges> +where + C: IterableRangeContainer, +{ + /// Create a new [`GenomeMap>`] of references to the data in the + /// data container, grouped by chromosome. + pub fn data_refs_by_seqname(&self) -> Result>, GRangesError> { + let data = self.data().ok_or(GRangesError::NoDataContainer)?; + let mut all_new_data = GenomeMap::new(); + for (seqname, indices) in self.data_indices()? { + let mut new_data = Vec::new(); + for index in indices { + let value = data.get(index).unwrap(); + new_data.push(value); + } + all_new_data.insert(&seqname, new_data)?; + } + Ok(all_new_data) + } +} + impl GRanges where VecRangesIndexed: IterableRangeContainer, @@ -474,6 +617,19 @@ where } } +impl GRangesEmpty +where + ::RangeType: GenericRange, +{ + /// Retrieve all midpoints. + /// + /// These are calculated as (start + end)/2 in [`Position`], + /// which will truncate them. + pub fn midpoints(&self) -> Result>, GRangesError> { + self.0.midpoints() + } +} + impl GRanges> { /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. pub fn push_range( @@ -1037,6 +1193,38 @@ impl GRangesEmpty { } } +impl GRanges { + /// Convert the [`COITreesIndexed`] range containers in this [`GRanges`] to a + /// [`VecRangesIndexed`]. + pub fn into_vecranges(self) -> Result, GRangesError> { + let old_ranges = self.ranges; + let mut new_ranges: GenomeMap = GenomeMap::new(); + for (seqname, trees) in old_ranges.into_iter() { + new_ranges.insert(&seqname, trees.into())?; + } + Ok(GRanges { + ranges: new_ranges, + data: self.data, + }) + } +} + +impl GRangesEmpty { + /// Convert the [`COITreesEmpty`] range containers in this [`GRanges`] to a + /// [`VecRangesEmpty`]. + pub fn into_vecranges(self) -> Result, GRangesError> { + let old_ranges = self.0.ranges; + let mut new_ranges: GenomeMap = GenomeMap::new(); + for (seqname, trees) in old_ranges.into_iter() { + new_ranges.insert(&seqname, trees.into())?; + } + Ok(GRangesEmpty(GRanges { + ranges: new_ranges, + data: None, + })) + } +} + impl GRangesEmpty where COITrees<()>: From, @@ -1558,4 +1746,30 @@ mod tests { let coit = vec.clone().into_coitrees().unwrap(); assert_eq!(coit, vec); } + + #[test] + fn test_midpoints() { + let gr = granges_test_case_02(); + let midpoints = gr.midpoints().unwrap(); + let mut iter = midpoints.iter(); + + let first_chrom = iter.next().unwrap(); + assert_eq!(first_chrom.0, "chr1"); + assert_eq!(*first_chrom.1, vec![40]); + let second_chrom = iter.next().unwrap(); + assert_eq!(second_chrom.0, "chr2"); + assert_eq!(*second_chrom.1, vec![150, 275]); + } + + #[test] + fn test_midpoints_empty() { + // and harder test case + let gr = granges_test_case_01().into_granges_empty().unwrap(); + let midpoints = gr.midpoints().unwrap(); + let mut iter = midpoints.iter(); + + let first_chrom = iter.next().unwrap(); + assert_eq!(first_chrom.0, "chr1"); + assert_eq!(*first_chrom.1, vec![2, 5, 13]); + } } diff --git a/src/io/mod.rs b/src/io/mod.rs index 3fb324a..d804f08 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -9,4 +9,4 @@ pub use parsers::{ bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator, GenomicRangesFile, GenomicRangesParser, }; -pub use tsv::BED_TSV; +pub use tsv::{TsvConfig, BED_TSV}; diff --git a/src/io/parsers/mod.rs b/src/io/parsers/mod.rs index d567591..28ab3a6 100644 --- a/src/io/parsers/mod.rs +++ b/src/io/parsers/mod.rs @@ -79,6 +79,8 @@ use crate::Position; use self::utils::get_base_extension; +use super::TsvRecordIterator; + /// Inspect the first line to check that it looks like a valid BED-like /// file, i.e. the first column is there (there are no reasonable checks /// for sequence names other than presence), and the next to columns can @@ -361,6 +363,19 @@ where } } +impl GeneralRangeRecordIterator> + for TsvRecordIterator> +where + U: Clone + for<'de> serde::Deserialize<'de>, +{ + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + impl GeneralRangeRecordIterator>> for BedlikeIterator { fn retain_seqnames( self, diff --git a/src/lib.rs b/src/lib.rs index d51b56b..18710f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -185,7 +185,7 @@ //! // Write to a TSV file, using the BED TSV format standards //! // for missing values, etc. //! let tempfile = tempfile::NamedTempFile::new().unwrap(); -//! results_gr.to_tsv(Some(tempfile.path()), &BED_TSV)?; +//! results_gr.write_to_tsv(Some(tempfile.path()), &BED_TSV)?; //! # Ok(()) //! # } //! # fn main() { try_main().unwrap(); } @@ -411,7 +411,10 @@ pub mod prelude { }; pub use crate::data::DatumType; - pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; + pub use crate::ranges::{ + try_range, + vec::{VecRangesEmpty, VecRangesIndexed}, + }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index b5db7be..419ab6f 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,6 +1,6 @@ //! The [`COITrees`] type. //! -//! This wraps the functionality og the [`coitrees`] library by Daniel C. Jones. +//! This wraps the functionality on the [`coitrees`] library by Daniel C. Jones. //! use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; @@ -83,7 +83,7 @@ impl, M: Clone> From> for COITrees } } -/// Convert a [`COITrees`] range container to a [`VecRanges`] range container. +/// Convert a [`COITreesIndexed`] range container to a [`VecRanges`] range container. impl From> for VecRanges { fn from(value: COITrees) -> Self { let length = value.length; @@ -95,8 +95,23 @@ impl From> for VecRanges { } } +/// Convert a [`COITreesEmpty`] range container to a [`VecRangesEmpty`] range container. +/// +///[`VecRangesEmpty`]: crate::ranges::vec::VecRangesEmpty +impl From> for VecRanges { + fn from(value: COITrees<()>) -> Self { + let length = value.length; + let mut ranges: VecRanges = VecRanges::new(length); + for interval in value.ranges.iter() { + ranges.push_range(interval.into()); + } + ranges + } +} + /// [`RangeContainer`] trait implementations. -impl RangeContainer for COITrees { +impl RangeContainer for COITrees { + type InternalRangeType = IntervalNode; fn len(&self) -> usize { self.ranges.len() } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index abe8310..5291922 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -86,6 +86,15 @@ impl AdjustableGenericRange for RangeEmpty { } } +impl From for RangeEmpty { + fn from(value: RangeIndexed) -> Self { + RangeEmpty { + start: value.start, + end: value.end, + } + } +} + /// [`RangeIndexed`] is a range with a valid /// index to a data element in the data container. #[derive(Clone, Debug, Default, PartialEq)] diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 34fb825..5652b6b 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -73,7 +73,8 @@ impl VecRanges { } } -impl RangeContainer for VecRanges { +impl RangeContainer for VecRanges { + type InternalRangeType = R; fn len(&self) -> usize { self.ranges.len() } diff --git a/src/sequences/nucleotide.rs b/src/sequences/nucleotide.rs index ac1c223..b476492 100644 --- a/src/sequences/nucleotide.rs +++ b/src/sequences/nucleotide.rs @@ -138,9 +138,9 @@ impl Index<&str> for NucleotideSequences { } } -impl<'a> Sequences<'a> for NucleotideSequences { - type Container = &'a Nucleotides; - type Slice = &'a [u8]; +impl Sequences for NucleotideSequences { + type Container<'a> = &'a Nucleotides; + type Slice<'a> = &'a [u8]; /// Retrieve all sequence names. fn seqnames(&self) -> Vec { @@ -148,7 +148,7 @@ impl<'a> Sequences<'a> for NucleotideSequences { } /// Retrieve the [`Nucleotides`] for a particular sequence name. - fn get_sequence(&'a self, seqname: &str) -> Result { + fn get_sequence(&self, seqname: &str) -> Result, GRangesError> { self.data .get(seqname) .ok_or(GRangesError::MissingSequenceName(seqname.to_string())) @@ -161,15 +161,15 @@ impl<'a> Sequences<'a> for NucleotideSequences { /// * `seqname`: the sequence name of the region to apply the function to. /// * `start`: the start position of the region to apply the function to. /// * `end`: the end position of the region to apply the function to. - fn region_apply( - &'a self, - func: F, + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, ) -> Result where - F: Fn(Self::Slice) -> V, + F: Fn(Self::Slice<'_>) -> V, { let seq = self.get_sequence(seqname)?; let range = try_range(start, end, seq.len().try_into().unwrap())?; @@ -179,10 +179,8 @@ impl<'a> Sequences<'a> for NucleotideSequences { /// Get the length of a particular sequence. fn get_sequence_length(&self, seqname: &str) -> Result { - self.seqlens() - .get(seqname) - .ok_or(GRangesError::MissingSequenceName(seqname.to_string())) - .copied() + let len: Position = self.get_sequence(seqname)?.len().try_into().unwrap(); + Ok(len) } } @@ -222,6 +220,8 @@ impl LazyNucleotideSequences { .as_ref() .map_or(true, |seqnames| seqnames.contains(&name)) { + #[allow(clippy::useless_conversion)] // following clippy's suggestion causes + // compile time error Some((name, r.length().try_into().unwrap())) } else { None @@ -274,9 +274,9 @@ impl LazyNucleotideSequences { } } -impl<'a> Sequences<'a> for LazyNucleotideSequences { - type Container = Ref<'a, Nucleotides>; - type Slice = &'a [u8]; +impl Sequences for LazyNucleotideSequences { + type Container<'a> = Ref<'a, Nucleotides>; + type Slice<'a> = &'a [u8]; /// Retrieve all sequence names. fn seqnames(&self) -> Vec { @@ -284,7 +284,7 @@ impl<'a> Sequences<'a> for LazyNucleotideSequences { } /// Retrieve the [`Nucleotides`] for a particular sequence name. - fn get_sequence(&'a self, seqname: &str) -> Result { + fn get_sequence(&self, seqname: &str) -> Result, GRangesError> { self.lazy.get_data(&seqname.to_string()) } @@ -295,9 +295,9 @@ impl<'a> Sequences<'a> for LazyNucleotideSequences { /// * `seqname`: the sequence name of the region to apply the function to. /// * `start`: the start position of the region to apply the function to. /// * `end`: the end position of the region to apply the function to. - fn region_apply( - &'a self, - func: F, + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, @@ -401,79 +401,81 @@ pub fn gc_content_strict(seq: &[u8]) -> f64 { #[cfg(test)] mod tests { - use super::{LazyNucleotideSequences, NucleotideSequences}; - use crate::{sequences::nucleotide::Nucleotides, traits::Sequences}; + use super::{gc_content_strict, LazyNucleotideSequences, NucleotideSequences}; + use crate::{granges::GRangesEmpty, sequences::nucleotide::Nucleotides, traits::Sequences}; #[test] fn test_nucleotide_sequences() { - let ref_file = "tests_data/sequences/small.fa.gz"; + let ref_file = "tests_data/sequences/test_case_01.fa.gz"; let reference = NucleotideSequences::from_fasta(ref_file, None).expect("could not load reference"); assert_eq!( - *reference.get_sequence("sequence-1").unwrap(), - Nucleotides::from("ATACGATAACCAGTAACAG") + *reference.get_sequence("chr1").unwrap(), + Nucleotides::from("TTCACTACTATTAGTACTCACGGCGCAATA") ); assert_eq!(reference.seqnames().len(), 2); - assert_eq!(*reference.seqlens().get("sequence-1").unwrap(), 19); + assert_eq!(*reference.seqlens().get("chr1").unwrap(), 30); } #[test] fn test_lazyload() { - let ref_file = "tests_data/sequences/small.fa.gz"; + let ref_file = "tests_data/sequences/test_case_01.fa.gz"; let reference = LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); - assert_eq!(*reference.seqlens.get("sequence-1").unwrap(), 19); - assert_eq!(*reference.seqlens.get("sequence-2").unwrap(), 28); + assert_eq!(*reference.seqlens.get("chr1").unwrap(), 30); + assert_eq!(*reference.seqlens.get("chr2").unwrap(), 100); let seqlens = reference.seqlens(); // Test #1: nothing should be loaded yet - assert_eq!(reference.is_loaded("sequence-1"), false); - assert_eq!(reference.is_loaded("sequence-2"), false); + assert_eq!(reference.is_loaded("chr1"), false); + assert_eq!(reference.is_loaded("chr2"), false); // Test #2: validate the range summary by getting whole // chromosome sequence length through the apply funcs - let seq1_len = seqlens.get("sequence-1").unwrap(); + let seq1_len = seqlens.get("chr1").unwrap(); fn get_len(seq: &[u8]) -> usize { seq.len() } let total_len = reference - .region_apply(get_len, "sequence-1", 0, *seq1_len) + .region_map(&get_len, "chr1", 0, *seq1_len) .unwrap(); assert_eq!(total_len, *seq1_len as usize); // Test #3: make sure the previous sequence is loaded still - assert_eq!(reference.is_loaded("sequence-1"), true); + assert_eq!(reference.is_loaded("chr1"), true); // Test #4: make sure the next sequence is not loaded yet. - assert_eq!(reference.is_loaded("sequence-2"), false); + assert_eq!(reference.is_loaded("chr2"), false); // Test #5: load the next sequence - let chr2_len = seqlens.get("sequence-2").unwrap(); + let chr2_len = seqlens.get("chr2").unwrap(); let total_len = reference - .region_apply(get_len, "sequence-2", 0, *chr2_len) + .region_map(&get_len, "chr2", 0, *chr2_len) .unwrap(); assert_eq!(total_len, *chr2_len as usize); dbg!(&reference); - assert!(reference.is_loaded("sequence-2")); + assert!(reference.is_loaded("chr2")); } #[test] fn lazy_region_apply_slice() { - let ref_file = "tests_data/sequences/small.fa.gz"; + let ref_file = "tests_data/sequences/test_case_01.fa.gz"; let reference = LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); #[allow(non_snake_case)] + // pycode for comparison + // len([l for l in 'TTCACTACTATTAGTACTCACGGCGCAATA'[3:10] if l == 'C']) let total_Cs = reference - .region_apply( - |seq| seq.iter().filter(|c| **c == b'C').count(), - "sequence-1", + .region_map( + &|seq| seq.iter().filter(|c| **c == b'C').count(), + "chr1", 3, 10, ) @@ -481,10 +483,12 @@ mod tests { assert_eq!(total_Cs, 2); #[allow(non_snake_case)] + // pycode for comparison + // len([l for l in 'TTCACTACTATTAGTACTCACGGCGCAATA'[3:10] if l == 'A']) let total_As = reference - .region_apply( - |seq| seq.iter().filter(|c| **c == b'A').count(), - "sequence-1", + .region_map( + &|seq| seq.iter().filter(|c| **c == b'A').count(), + "chr1", 3, 10, ) @@ -492,6 +496,57 @@ mod tests { assert_eq!(total_As, 3); } + #[test] + fn test_map_into_granges() { + let ref_file = "tests_data/sequences/test_case_01.fa.gz"; + let reference = + LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); + + let seqlens = reference.seqlens(); + + let windows = GRangesEmpty::from_windows(&seqlens, 10, None, false).unwrap(); + + let make_string = |seq: &[u8]| String::from_utf8(seq.to_vec()).unwrap(); + + // our test: get subsequences (as Strings), with *no step*, and reconstruct the + // original sequence + let seqs_windows = reference + .region_map_into_granges(&windows, &make_string) + .unwrap(); + let subseqs = seqs_windows.data_refs_by_seqname().unwrap(); + let subseqs_chr1 = subseqs.get("chr1").unwrap(); + let chr1_reconstructed: Nucleotides = subseqs_chr1 + .iter() + .map(AsRef::as_ref) + .collect::>() + .join("") + .into(); + let chr1_seq = reference.get_sequence("chr1").unwrap(); + assert_eq!(chr1_reconstructed, *chr1_seq); + } + + #[test] + fn test_map_into_granges_gc() { + let ref_file = "tests_data/sequences/test_case_01.fa.gz"; + let reference = + LazyNucleotideSequences::new(ref_file, None).expect("could not load reference"); + + let seqlens = reference.seqlens(); + + let windows = GRangesEmpty::from_windows(&seqlens, 10, Some(5), false).unwrap(); + + // our test: calc GC content + let gc_windows = reference + .region_map_into_granges(&windows, &gc_content_strict) + .unwrap(); + let gc = gc_windows.data_by_seqname().unwrap(); + // TODO: double check? calc'd with help from py but should hand calc. + assert_eq!( + gc.get("chr1").unwrap().to_vec(), + vec![0.3, 0.2, 0.3, 0.7, 0.6, 0.2] + ); + } + #[cfg(test)] #[cfg(feature = "human_tests")] fn test_lazyload_human() { diff --git a/src/sequences/numeric.rs b/src/sequences/numeric.rs index d251fea..72f1351 100644 --- a/src/sequences/numeric.rs +++ b/src/sequences/numeric.rs @@ -9,11 +9,11 @@ use genomap::GenomeMap; use indexmap::IndexMap; #[cfg(feature = "ndarray")] use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2}; -use ndarray_npy::ReadableElement; use std::path::PathBuf; use super::lazy::LazyLoader; use crate::error::GRangesError; +use crate::ranges::try_range; use crate::traits::Sequences; use crate::Position; @@ -57,15 +57,19 @@ impl NumericSequences1 { } } -impl<'a, T: 'a> Sequences<'a> for NumericSequences1 { - type Container = &'a Array1; - type Slice = ArrayView1<'a, T>; +// TODO/NEEDS_TEST +impl Sequences for NumericSequences1 +where + T: Copy + Default + Sized + 'static, // f64-like traits +{ + type Container<'a> = &'a Array1; + type Slice<'a> = ArrayView1<'a, T>; fn seqnames(&self) -> Vec { self.data.names() } - fn get_sequence(&'a self, seqname: &str) -> Result { + fn get_sequence(&self, seqname: &str) -> Result, GRangesError> { let seq = self .data .get(seqname) @@ -77,20 +81,19 @@ impl<'a, T: 'a> Sequences<'a> for NumericSequences1 { /// range. /// /// # Arguments - fn region_apply( - &'a self, - func: F, + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, ) -> Result where - F: for<'b> Fn(Self::Slice) -> V, + F: for<'b> Fn(Self::Slice<'b>) -> V, { let seq = self.get_sequence(seqname)?; - let start_usize: usize = start.try_into().unwrap(); - let end_usize: usize = end.try_into().unwrap(); - let view = seq.slice(s![start_usize..end_usize]); + let range = try_range(start, end, seq.len().try_into().unwrap())?; + let view = seq.slice(s![range]); Ok(func(view)) } @@ -127,9 +130,12 @@ impl NumericSequences2 { } } -impl<'a, T: 'a> Sequences<'a> for NumericSequences2 { - type Container = &'a Array2; - type Slice = ArrayView2<'a, T>; +impl Sequences for NumericSequences2 +where + T: Copy + Default + Sized + 'static, // f64-like traits +{ + type Container<'a> = &'a Array2; + type Slice<'a> = ArrayView2<'a, T>; /// Retrieves the names of all sequences stored in the container. /// @@ -174,7 +180,7 @@ impl<'a, T: 'a> Sequences<'a> for NumericSequences2 { /// let numeric_seq = NumericSequences1::new(data); /// let sequence = numeric_seq.get_sequence("chr1").unwrap(); /// ``` - fn get_sequence(&'a self, seqname: &str) -> Result { + fn get_sequence(&self, seqname: &str) -> Result, GRangesError> { let seq = self .data .get(seqname) @@ -201,31 +207,32 @@ impl<'a, T: 'a> Sequences<'a> for NumericSequences2 { /// use crate::granges::traits::Sequences; /// use granges::sequences::numeric::NumericSequences1; /// use granges::test_utilities::random_array1_sequences; + /// use ndarray::ArrayView1; /// /// /// let mut data = random_array1_sequences(100); /// let numeric_seq = NumericSequences1::new(data); - /// let result = numeric_seq.region_apply( - /// |view| view.sum(), + /// let result = numeric_seq.region_map( + /// &|view: ArrayView1<'_, f64> | view.sum(), /// "chr1", /// 0, /// 10 /// ).unwrap(); /// ``` - fn region_apply( - &'a self, - func: F, + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, ) -> Result where - F: for<'b> Fn(Self::Slice) -> V, + F: for<'b> Fn(Self::Slice<'_>) -> V, { let seq = self.get_sequence(seqname)?; - let start_usize: usize = start.try_into().unwrap(); - let end_usize: usize = end.try_into().unwrap(); - let view = seq.slice(s![start_usize..end_usize, ..]); + let range = try_range(start, end, seq.len().try_into().unwrap())?; + let seq = self.get_sequence(seqname)?; + let view = seq.slice(s![range, ..]); Ok(func(view)) } @@ -261,18 +268,12 @@ impl<'a, T: 'a> Sequences<'a> for NumericSequences2 { } /// A lazy-loaded two-dimensional generic numeric per-basepair container. -pub struct LazyNumericSequences2 -where - T: std::fmt::Debug, -{ +pub struct LazyNumericSequences2 { seqlens: IndexMap, lazy: LazyLoader, Array2, String>, } -impl LazyNumericSequences2 -where - T: ReadableElement + std::fmt::Debug, -{ +impl LazyNumericSequences2 { /// Create a link to an out-of-memory per-basepair [`Array2`] sequence data container. /// /// For `.npy` files, use [`ndarray_npy::read_npy`] for the `loader`. @@ -301,12 +302,12 @@ where } } -impl<'a, T: 'a> Sequences<'a> for LazyNumericSequences2 +impl Sequences for LazyNumericSequences2 where - T: std::fmt::Debug, + T: Copy + Default + Sized + 'static + std::fmt::Debug, // f64-like traits { - type Container = Ref<'a, Array2>; - type Slice = ArrayView2<'a, T>; + type Container<'a> = Ref<'a, Array2>; + type Slice<'a> = ArrayView2<'a, T>; /// Retrieves the names of all sequences stored in the container. /// @@ -329,7 +330,7 @@ where /// A result containing a reference to the sequence data (`Array1`) on success, /// or a `GRangesError` on failure. /// - fn get_sequence(&'a self, seqname: &str) -> Result { + fn get_sequence(&self, seqname: &str) -> Result, GRangesError> { self.lazy.get_data(&seqname.to_string()) } @@ -346,9 +347,9 @@ where /// /// A result containing the value returned by the function `func`, or a `GRangesError` on failure. /// - fn region_apply( - &'a self, - func: F, + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, @@ -357,9 +358,8 @@ where F: for<'b> Fn(ArrayView2<'b, T>) -> V, { let seq = self.get_sequence(seqname)?; - let start_usize: usize = start.try_into().unwrap(); - let end_usize: usize = end.try_into().unwrap(); - let view = seq.slice(s![start_usize..end_usize, ..]); + let range = try_range(start, end, seq.len().try_into().unwrap())?; + let view = seq.slice(s![range, ..]); let value = func(view); Ok(value) } diff --git a/src/test_utilities.rs b/src/test_utilities.rs index b9aa00a..9a66f20 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -201,6 +201,9 @@ pub fn random_bed5file(length: usize) -> NamedTempFile { /// Range test case #1 /// +/// This also has a corresponding reference sequence +/// test file in: `tests_data/sequences/test_case_01.fa.gz` +/// /// Ranges: /// - chr1: /// (0, 5, Some(1.1)) @@ -213,6 +216,7 @@ pub fn random_bed5file(length: usize) -> NamedTempFile { /// Seqlens: { "chr1" => 30, "chr2" => 100 } /// /// Sum of all elements: 24.1 +/// pub fn granges_test_case_01() -> GRanges> { create_granges_with_seqlens!(VecRangesIndexed, Vec, { "chr1" => [(0, 5, 1.1), (4, 7, 8.1), (10, 17, 10.1)], diff --git a/src/traits.rs b/src/traits.rs index e76c5ec..8fffea5 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -14,6 +14,7 @@ use crate::{ tsv::TsvConfig, }, join::LeftGroupedJoin, + prelude::VecRangesIndexed, ranges::GenomicRangeRecord, Position, }; @@ -34,6 +35,12 @@ pub trait AsGRangesRef<'a, C, T> { /// The [`LeftOverlaps`] trait provides compile time polymorphic behavior /// over its associated [`LeftOverlaps::Output`] type and its `Right` /// generic type. +/// +/// # Errors +/// This needs to return [`Result`] because: +/// - [`GRanges::push_range_with_join()`], etc could fail, due to an invalid +/// range, chromosome name, etc. +/// - Taking an empty data container. pub trait LeftOverlaps<'a, Right> { type Output; @@ -44,7 +51,7 @@ pub trait LeftOverlaps<'a, Right> { /// object, for some mix of generic types, to a TSV file. pub trait GenomicRangesTsvSerialize<'a, C: RangeContainer> { /// Output the TSV version of this [`GRanges`] object. - fn to_tsv( + fn write_to_tsv( &'a self, output: Option>, config: &TsvConfig, @@ -59,6 +66,9 @@ pub trait GenericRange: Clone { fn width(&self) -> Position { self.end() - self.start() } + fn midpoint(&self) -> Position { + (self.start() + self.end()) / 2 + } /// Calculate how many basepairs overlap this range and other. fn overlap_width(&self, other: &R) -> Position { let overlap_start = std::cmp::max(self.start(), other.start()); @@ -151,6 +161,7 @@ pub trait AdjustableGenericRange: GenericRange { /// [`VecRanges`]: crate::ranges::vec::VecRanges /// [`COITrees`]: crate::ranges::coitrees::COITrees pub trait RangeContainer { + type InternalRangeType; // the internal stored range type fn len(&self) -> usize; fn is_empty(&self) -> bool { self.len() == 0 @@ -195,7 +206,7 @@ where Self: RangeContainer, ::RangeType: GenericRange, { - type RangeType: GenericRange; + type RangeType: GenericRange; // the iterator range type fn iter_ranges(&self) -> Box + '_>; } @@ -258,30 +269,44 @@ pub trait IndexedDataContainer: DataContainer { /// The Sequences trait defines generic functionality for /// per-basepair data, e.g. nucleotide sequences or some /// per-basepair numeric score. -pub trait Sequences<'a> { - type Container: 'a; - type Slice; +pub trait Sequences { + type Container<'a>: 'a + where + Self: 'a; + type Slice<'a>; fn seqnames(&self) -> Vec; - fn get_sequence(&'a self, seqname: &str) -> Result; + fn get_sequence(&self, seqname: &str) -> Result, GRangesError>; fn get_sequence_length(&self, seqname: &str) -> Result; - /// Evaluate a function on a [`Sequences::Slice`] of a sequence. + /// Apply a function on a [`Sequences::Slice`] of a sequence. /// /// # Arguments /// * `func` - a function that takes a `Self::Slice` and returns a [`Result`] /// * `seqname` - sequence name. /// * `start` - a [`Position`] start position. /// * `end` - a [`Position`] *inclusive* end position. - fn region_apply( - &'a self, - func: F, + /// + /// If you're implementing this trait method, it *must* being with: + /// + /// ```no_run + /// use granges::prelude::try_range; + /// // let seq = self.get_sequence(seqname)?; // e.g. this normally + /// let seq = "GATAGAGAGTAGAGTA"; + /// let range = try_range(0, 10, seq.len().try_into().unwrap()).unwrap(); + /// ``` + /// + /// to validate the range and to avoid panics. + /// + fn region_map( + &self, + func: &F, seqname: &str, start: Position, end: Position, ) -> Result where - F: Fn(::Slice) -> V; + F: Fn(::Slice<'_>) -> V; fn seqlens(&self) -> Result, GRangesError> { let mut seqlens = IndexMap::new(); @@ -291,6 +316,34 @@ pub trait Sequences<'a> { } Ok(seqlens) } + + /// Create a new [`GRanges>`] by apply the function `func` on + /// the genomic ranges from `granges`. + /// + /// # Arguments + fn region_map_into_granges<'b, C, F, V, T: 'b>( + &self, + granges: &'b impl AsGRangesRef<'b, C, T>, + func: &F, + ) -> Result>, GRangesError> + where + V: Clone, + C: IterableRangeContainer + 'b, + F: Fn(::Slice<'_>) -> V, + { + let granges_ref = granges.as_granges_ref(); + let seqlens = &granges_ref.seqlens().clone(); + let mut gr: GRanges> = GRanges::new_vec(seqlens); + for (seqname, ranges) in granges_ref.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. + for range in ranges.iter_ranges() { + let (start, end) = (range.start(), range.end()); + let value = self.region_map(&func, seqname, start, end)?; + gr.push_range(seqname, start, end, value.clone())?; + } + } + Ok(gr) + } } /// Defines how to serialize something to TSV. diff --git a/tests_data/sequences/Makefile b/tests_data/sequences/Makefile new file mode 100644 index 0000000..191fd3e --- /dev/null +++ b/tests_data/sequences/Makefile @@ -0,0 +1,18 @@ +# does all the bgzip/indexing of test files + +.PHONY: clean all + +all: test_case_01.fa.gz.fai test_case_01.fa.gz.gzi + + +test_case_01.fa.gz: test_case_01.fa + bgzip -c $< > $@ + + +test_case_01.fa.gz.fai test_case_01.fa.gz.gzi: test_case_01.fa.gz + samtools faidx $< > $@ + +clean: + rm -f test_case_01.fa.gz.fai test_case_01.fa.gz.gzi test_case_01.fa.gz.fai test_case_01.fa.gz + + diff --git a/tests_data/sequences/small.fa.gz b/tests_data/sequences/small.fa.gz deleted file mode 100644 index 7424654..0000000 Binary files a/tests_data/sequences/small.fa.gz and /dev/null differ diff --git a/tests_data/sequences/small.fa.gz.fai b/tests_data/sequences/small.fa.gz.fai deleted file mode 100644 index 38b485d..0000000 --- a/tests_data/sequences/small.fa.gz.fai +++ /dev/null @@ -1,2 +0,0 @@ -sequence-1 19 12 19 20 -sequence-2 28 44 28 29 diff --git a/tests_data/sequences/test_case_01.fa b/tests_data/sequences/test_case_01.fa new file mode 100644 index 0000000..0e8ba06 --- /dev/null +++ b/tests_data/sequences/test_case_01.fa @@ -0,0 +1,4 @@ +>chr1 +TTCACTACTATTAGTACTCACGGCGCAATA +>chr2 +CCACCACAGCCTTGTCTCGCCAGAATGCTGGTCAGCATACGGAAGAGCTCAAGGCAGGTCAATTCGCACTGTCAGGGTCACATGGGTGTTTGGCACTACC diff --git a/tests_data/sequences/test_case_01.fa.gz b/tests_data/sequences/test_case_01.fa.gz new file mode 100644 index 0000000..cc4f966 Binary files /dev/null and b/tests_data/sequences/test_case_01.fa.gz differ diff --git a/tests_data/sequences/test_case_01.fa.gz.fai b/tests_data/sequences/test_case_01.fa.gz.fai new file mode 100644 index 0000000..c4b7c2b --- /dev/null +++ b/tests_data/sequences/test_case_01.fa.gz.fai @@ -0,0 +1,2 @@ +chr1 30 6 30 31 +chr2 100 43 100 101 diff --git a/tests_data/sequences/small.fa.gz.gzi b/tests_data/sequences/test_case_01.fa.gz.gzi similarity index 100% rename from tests_data/sequences/small.fa.gz.gzi rename to tests_data/sequences/test_case_01.fa.gz.gzi