From 32969950a7b5bf1b15fa4508a82c5d3b80b39441 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sun, 25 Feb 2024 11:12:08 -0800 Subject: [PATCH] `left_overlaps()` bug fix + tests, doc. - New `to_tsv()` doc. - Fixed issue with `left_overlaps()`. - New `left_overlaps()` tests. - New example in main doc. - New `TsvSerialize` impls. --- src/granges.rs | 67 +++++++++++++++++++++++++++++++++++++------ src/io/mod.rs | 4 ++- src/io/tsv.rs | 60 ++++++++++++++++++++++++++++++++++++++ src/lib.rs | 67 ++++++++++++++++++++++++++++++++++++++++++- src/test_utilities.rs | 16 +++++++++++ 5 files changed, 203 insertions(+), 11 deletions(-) diff --git a/src/granges.rs b/src/granges.rs index e4894f1..3c4a93b 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -224,7 +224,13 @@ where T: TsvSerialize, ::Item<'a>: TsvSerialize, { - /// Write + /// Write this [`GRanges`] object to a TSV file to `output`, using the [`TsvConfig`] + /// specified with `config`. + /// + /// # Arguments + /// * `output`: either `None` (for standard out) or file path. If the filepath + /// ends in `.gz`, the output will be gzip-compressed. + /// * `config`: a [`TsvConfig`], which contains the TSV output settings. fn to_tsv( &'a self, output: Option>, @@ -247,7 +253,14 @@ where } impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRangesEmpty { - /// Output a BED3 file for for this data-less [`GRanges`]. + /// Write this [`GRangesEmpty`] object to a TSV file to `output`, using the [`TsvConfig`] + /// specified with `config`. Since this [`GRangesEmpty`] contains no data, the output + /// will be a BED3 file. + /// + /// # Arguments + /// * `output`: either `None` (for standard out) or file path. If the filepath + /// ends in `.gz`, the output will be gzip-compressed. + /// * `config`: a [`TsvConfig`], which contains the TSV output settings. fn to_tsv( &'a self, output: Option>, @@ -654,7 +667,7 @@ where } /// [`GRanges::left_overlaps()`] for the left with data, right empty case. -impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty> for GRanges +impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty> for GRanges where DL: IndexedDataContainer + 'a, { @@ -706,9 +719,8 @@ where } /// [`GRanges::left_overlaps()`] for the left empty, right with data case. -impl<'a, C, DR: 'a> LeftOverlaps<'a, GRanges> for GRangesEmpty +impl<'a, DR: 'a> LeftOverlaps<'a, GRanges> for GRangesEmpty where - C: IterableRangeContainer, DR: IndexedDataContainer + 'a, { type Output = GRanges, JoinDataLeftEmpty<'a, DR>>; @@ -759,7 +771,10 @@ where } /// [`GRanges::left_overlaps()`] for the left empty, right empty case. -impl<'a> LeftOverlaps<'a, GRangesEmpty> for GRangesEmpty { +impl<'a, C> LeftOverlaps<'a, GRangesEmpty> for GRangesEmpty +where + C: IterableRangeContainer, +{ type Output = GRanges, JoinDataBothEmpty>; /// Conduct a left overlap join, consuming self and returning a new @@ -1446,7 +1461,25 @@ mod tests { } #[test] - fn test_apply_over_joins() { + fn test_left_with_data_right_empty() { + let sl = seqlens!("chr1" => 50); + let mut left_gr: GRanges> = GRanges::new_vec(&sl); + left_gr.push_range("chr1", 1, 2, 1.1).unwrap(); + left_gr.push_range("chr1", 5, 7, 2.8).unwrap(); + + let windows = GRangesEmpty::from_windows(&sl, 10, None, true) + .unwrap() + .into_coitrees() + .unwrap(); + + let joined_results = left_gr.left_overlaps(&windows).unwrap(); + + // check the joined results + assert_eq!(joined_results.len(), 2) + } + + #[test] + fn test_map_over_joins() { let sl = seqlens!("chr1" => 50); let windows: GRangesEmpty = GRangesEmpty::from_windows(&sl, 10, None, true).unwrap(); @@ -1460,8 +1493,24 @@ mod tests { let right_gr = right_gr.into_coitrees().unwrap(); // TODO - let joined_results = windows.left_overlaps(&right_gr).unwrap(); - // joined_results.apply_over_joins(); + let joined_results = windows + .left_overlaps(&right_gr) + .unwrap() + .map_over_joins(|join_data| { + let overlap_scores = join_data.right_data; + overlap_scores.iter().sum::() + }) + .unwrap(); + + let mut iter = joined_results.iter_ranges(); + let first_range = iter.next().unwrap(); + assert_eq!(first_range.start, 0); + assert_eq!(first_range.end, 10); + + let mut data_iter = joined_results.data.as_ref().unwrap().iter(); + assert_eq!(*data_iter.next().unwrap(), 5.0); + assert_eq!(*data_iter.next().unwrap(), 0.0); + assert_eq!(*data_iter.next().unwrap(), 4.1); } #[test] diff --git a/src/io/mod.rs b/src/io/mod.rs index 7362132..93c2234 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,5 +6,7 @@ pub mod tsv; pub use file::{InputStream, OutputStream}; pub use parsers::{ - Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator, + Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, + TsvRecordIterator, }; +pub use tsv::BED_TSV; diff --git a/src/io/tsv.rs b/src/io/tsv.rs index dc0429f..efac925 100644 --- a/src/io/tsv.rs +++ b/src/io/tsv.rs @@ -56,6 +56,66 @@ impl TsvSerialize for &Vec { } } +impl TsvSerialize for &f64 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + // TODO precision from config + format!("{}", self).to_string() + } +} + +impl TsvSerialize for f64 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + // TODO precision from config + format!("{}", self).to_string() + } +} + +impl TsvSerialize for &f32 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + // TODO precision from config + format!("{}", self).to_string() + } +} + +impl TsvSerialize for f32 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + // TODO precision from config + format!("{}", self).to_string() + } +} + +impl TsvSerialize for &i64 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!("{}", self).to_string() + } +} + +impl TsvSerialize for i64 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!("{}", self).to_string() + } +} + +impl TsvSerialize for &i32 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!("{}", self).to_string() + } +} + +impl TsvSerialize for i32 { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!("{}", self).to_string() + } +} + impl TsvSerialize for DatumType { fn to_tsv(&self, config: &TsvConfig) -> String { match self { diff --git a/src/lib.rs b/src/lib.rs index c2b6324..c4b41c4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,68 @@ //! analagous runtime dynamic data structures that would allow the library to be wrapped and used //! by other languages. //! +//! ## Example +//! +//! The best introduction to the GRanges library is an example illustrating +//! how a common genomic data processing task can be expressed in few lines +//! of (relatively simple) code. Below is an annotated implementation of +//! `granges map`, which provides the same functionality of `bedtools map`: +//! arbitrary operations (e.g. mean, median, max, etc) are applied to the numeric +//! scores of all right ranges that overlap a each left range: +//! +//! ``` +//! use granges::prelude::*; +//! +//! // Data for example: +//! let genome = seqlens!("chr1" => 100, "chr2" => 100); +//! let left_path = "tests_data/bedtools/map_a.txt"; +//! let right_path = "tests_data/bedtools/map_b.txt"; +//! +//! // Read in the "genome file" of chromosomes and their lengths. +//! let seqnames: Vec = genome.keys().cloned().collect(); +//! +//! // Create parsing iterators to the left and right BED files. +//! let left_iter = Bed3Iterator::new(left_path) +//! .expect("error inferring filetype"); +//! let right_iter = Bed5Iterator::new(right_path) +//! .expect("error inferring filetype"); +//! +//! // Filter out any ranges from chromosomes not in our genome file. +//! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome) +//! .expect("error parsing file"); +//! let right_gr = GRanges::from_iter(right_iter.retain_seqnames(&seqnames), &genome) +//! .expect("error parsing file"); +//! +//! +//! // Create the "right" GRanges object, convert the ranges to an +//! // interval trees, and tidy it by selecting out a f64 score. +//! let right_gr = { +//! right_gr +//! // Convert to interval trees. +//! .into_coitrees().expect("error computing interval trees") +//! // Extract out just the score from the additional BED5 columns. +//! .map_data(|bed5_cols| { +//! bed5_cols.score +//! }).expect("error selecting score") +//! }; +//! +//! // Find the overlaps by doing a *left grouped join*. +//! let left_join_gr = left_gr.left_overlaps(&right_gr) +//! .expect("error in computing overlaps"); +//! +//! // Process all the overlaps. +//! let result_gr = left_join_gr.map_over_joins(|join_data| { +//! // Get the "right data" -- the BED5 scores. +//! let overlap_scores = join_data.right_data; +//! let score_sum: f64 = overlap_scores.iter().sum(); +//! score_sum / (overlap_scores.len() as f64) +//! }).expect("error computing mean score"); +//! +//! // Write to a TSV file, using the BED TSV format standards +//! // for missing values, etc. +//! let path = Some("map_results.bed.gz"); +//! result_gr.to_tsv(path, &BED_TSV).expect("error writing output"); +//! ``` //! //! ## Design Overview //! @@ -151,10 +213,13 @@ pub mod prelude { pub use crate::error::GRangesError; pub use crate::granges::{GRanges, GRangesEmpty}; pub use crate::io::file::read_seqlens; + pub use crate::io::tsv::BED_TSV; pub use crate::io::{ - Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator, + Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, + TsvRecordIterator, }; + pub use crate::data::DatumType; pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index b9b6572..23b6e6b 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -138,6 +138,22 @@ pub fn random_bed3file(length: usize) -> NamedTempFile { temp_bedfile } +/// Create a random BED5 file based on the hg38 sequence lengths, and write to disk. +/// +/// The feature names are random lowercase characters, and the scores are +/// random floats. +pub fn random_bed5file(length: usize) -> NamedTempFile { + let temp_bedfile = temp_bedfile(); + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + length, + Some(temp_bedfile.path()), + true, + ) + .expect("could not generate random BED file"); + temp_bedfile +} + /// Range test case #1 /// /// Ranges: