Skip to content

Commit

Permalink
left_overlaps() bug fix + tests, doc.
Browse files Browse the repository at this point in the history
 - New `to_tsv()` doc.
 - Fixed issue with `left_overlaps()`.
 - New `left_overlaps()` tests.
 - New example in main doc.
 - New `TsvSerialize` impls.
  • Loading branch information
vsbuffalo committed Feb 25, 2024
1 parent a5c3a00 commit 3296995
Show file tree
Hide file tree
Showing 5 changed files with 203 additions and 11 deletions.
67 changes: 58 additions & 9 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,13 @@ where
T: TsvSerialize,
<T as IndexedDataContainer>::Item<'a>: TsvSerialize,
{
/// Write
/// Write this [`GRanges`] object to a TSV file to `output`, using the [`TsvConfig`]
/// specified with `config`.
///
/// # Arguments
/// * `output`: either `None` (for standard out) or file path. If the filepath
/// ends in `.gz`, the output will be gzip-compressed.
/// * `config`: a [`TsvConfig`], which contains the TSV output settings.
fn to_tsv(
&'a self,
output: Option<impl Into<PathBuf>>,
Expand All @@ -247,7 +253,14 @@ where
}

impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRangesEmpty<R> {
/// Output a BED3 file for for this data-less [`GRanges<R, ()>`].
/// Write this [`GRangesEmpty`] object to a TSV file to `output`, using the [`TsvConfig`]
/// specified with `config`. Since this [`GRangesEmpty`] contains no data, the output
/// will be a BED3 file.
///
/// # Arguments
/// * `output`: either `None` (for standard out) or file path. If the filepath
/// ends in `.gz`, the output will be gzip-compressed.
/// * `config`: a [`TsvConfig`], which contains the TSV output settings.
fn to_tsv(
&'a self,
output: Option<impl Into<PathBuf>>,
Expand Down Expand Up @@ -654,7 +667,7 @@ where
}

/// [`GRanges::left_overlaps()`] for the left with data, right empty case.
impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRanges<VecRangesEmpty, DL>
impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRanges<VecRangesIndexed, DL>
where
DL: IndexedDataContainer + 'a,
{
Expand Down Expand Up @@ -706,9 +719,8 @@ where
}

/// [`GRanges::left_overlaps()`] for the left empty, right with data case.
impl<'a, C, DR: 'a> LeftOverlaps<'a, GRanges<COITreesIndexed, DR>> for GRangesEmpty<C>
impl<'a, DR: 'a> LeftOverlaps<'a, GRanges<COITreesIndexed, DR>> for GRangesEmpty<VecRangesEmpty>
where
C: IterableRangeContainer,
DR: IndexedDataContainer + 'a,
{
type Output = GRanges<VecRanges<RangeIndexed>, JoinDataLeftEmpty<'a, DR>>;
Expand Down Expand Up @@ -759,7 +771,10 @@ where
}

/// [`GRanges::left_overlaps()`] for the left empty, right empty case.
impl<'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRangesEmpty<COITreesEmpty> {
impl<'a, C> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRangesEmpty<C>
where
C: IterableRangeContainer,
{
type Output = GRanges<VecRanges<RangeIndexed>, JoinDataBothEmpty>;

/// Conduct a left overlap join, consuming self and returning a new
Expand Down Expand Up @@ -1446,7 +1461,25 @@ mod tests {
}

#[test]
fn test_apply_over_joins() {
fn test_left_with_data_right_empty() {
let sl = seqlens!("chr1" => 50);
let mut left_gr: GRanges<VecRangesIndexed, Vec<f64>> = GRanges::new_vec(&sl);
left_gr.push_range("chr1", 1, 2, 1.1).unwrap();
left_gr.push_range("chr1", 5, 7, 2.8).unwrap();

let windows = GRangesEmpty::from_windows(&sl, 10, None, true)
.unwrap()
.into_coitrees()
.unwrap();

let joined_results = left_gr.left_overlaps(&windows).unwrap();

// check the joined results
assert_eq!(joined_results.len(), 2)
}

#[test]
fn test_map_over_joins() {
let sl = seqlens!("chr1" => 50);
let windows: GRangesEmpty<VecRangesEmpty> =
GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
Expand All @@ -1460,8 +1493,24 @@ mod tests {
let right_gr = right_gr.into_coitrees().unwrap();

// TODO
let joined_results = windows.left_overlaps(&right_gr).unwrap();
// joined_results.apply_over_joins();
let joined_results = windows
.left_overlaps(&right_gr)
.unwrap()
.map_over_joins(|join_data| {
let overlap_scores = join_data.right_data;
overlap_scores.iter().sum::<f64>()
})
.unwrap();

let mut iter = joined_results.iter_ranges();
let first_range = iter.next().unwrap();
assert_eq!(first_range.start, 0);
assert_eq!(first_range.end, 10);

let mut data_iter = joined_results.data.as_ref().unwrap().iter();
assert_eq!(*data_iter.next().unwrap(), 5.0);
assert_eq!(*data_iter.next().unwrap(), 0.0);
assert_eq!(*data_iter.next().unwrap(), 4.1);
}

#[test]
Expand Down
4 changes: 3 additions & 1 deletion src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ pub mod tsv;

pub use file::{InputStream, OutputStream};
pub use parsers::{
Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator,
Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
TsvRecordIterator,
};
pub use tsv::BED_TSV;
60 changes: 60 additions & 0 deletions src/io/tsv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,66 @@ impl TsvSerialize for &Vec<DatumType> {
}
}

impl TsvSerialize for &f64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for f64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for &f32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for f32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for &i64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for i64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for &i32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for i32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for DatumType {
fn to_tsv(&self, config: &TsvConfig) -> String {
match self {
Expand Down
67 changes: 66 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,68 @@
//! analagous runtime dynamic data structures that would allow the library to be wrapped and used
//! by other languages.
//!
//! ## Example
//!
//! The best introduction to the GRanges library is an example illustrating
//! how a common genomic data processing task can be expressed in few lines
//! of (relatively simple) code. Below is an annotated implementation of
//! `granges map`, which provides the same functionality of `bedtools map`:
//! arbitrary operations (e.g. mean, median, max, etc) are applied to the numeric
//! scores of all right ranges that overlap a each left range:
//!
//! ```
//! use granges::prelude::*;
//!
//! // Data for example:
//! let genome = seqlens!("chr1" => 100, "chr2" => 100);
//! let left_path = "tests_data/bedtools/map_a.txt";
//! let right_path = "tests_data/bedtools/map_b.txt";
//!
//! // Read in the "genome file" of chromosomes and their lengths.
//! let seqnames: Vec<String> = genome.keys().cloned().collect();
//!
//! // Create parsing iterators to the left and right BED files.
//! let left_iter = Bed3Iterator::new(left_path)
//! .expect("error inferring filetype");
//! let right_iter = Bed5Iterator::new(right_path)
//! .expect("error inferring filetype");
//!
//! // Filter out any ranges from chromosomes not in our genome file.
//! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome)
//! .expect("error parsing file");
//! let right_gr = GRanges::from_iter(right_iter.retain_seqnames(&seqnames), &genome)
//! .expect("error parsing file");
//!
//!
//! // Create the "right" GRanges object, convert the ranges to an
//! // interval trees, and tidy it by selecting out a f64 score.
//! let right_gr = {
//! right_gr
//! // Convert to interval trees.
//! .into_coitrees().expect("error computing interval trees")
//! // Extract out just the score from the additional BED5 columns.
//! .map_data(|bed5_cols| {
//! bed5_cols.score
//! }).expect("error selecting score")
//! };
//!
//! // Find the overlaps by doing a *left grouped join*.
//! let left_join_gr = left_gr.left_overlaps(&right_gr)
//! .expect("error in computing overlaps");
//!
//! // Process all the overlaps.
//! let result_gr = left_join_gr.map_over_joins(|join_data| {
//! // Get the "right data" -- the BED5 scores.
//! let overlap_scores = join_data.right_data;
//! let score_sum: f64 = overlap_scores.iter().sum();
//! score_sum / (overlap_scores.len() as f64)
//! }).expect("error computing mean score");
//!
//! // Write to a TSV file, using the BED TSV format standards
//! // for missing values, etc.
//! let path = Some("map_results.bed.gz");
//! result_gr.to_tsv(path, &BED_TSV).expect("error writing output");
//! ```
//!
//! ## Design Overview
//!
Expand Down Expand Up @@ -151,10 +213,13 @@ pub mod prelude {
pub use crate::error::GRangesError;
pub use crate::granges::{GRanges, GRangesEmpty};
pub use crate::io::file::read_seqlens;
pub use crate::io::tsv::BED_TSV;
pub use crate::io::{
Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator,
Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
TsvRecordIterator,
};

pub use crate::data::DatumType;
pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
pub use crate::traits::{
AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations,
Expand Down
16 changes: 16 additions & 0 deletions src/test_utilities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,22 @@ pub fn random_bed3file(length: usize) -> NamedTempFile {
temp_bedfile
}

/// Create a random BED5 file based on the hg38 sequence lengths, and write to disk.
///
/// The feature names are random lowercase characters, and the scores are
/// random floats.
pub fn random_bed5file(length: usize) -> NamedTempFile {
let temp_bedfile = temp_bedfile();
granges_random_bed(
"tests_data/hg38_seqlens.tsv",
length,
Some(temp_bedfile.path()),
true,
)
.expect("could not generate random BED file");
temp_bedfile
}

/// Range test case #1
///
/// Ranges:
Expand Down

0 comments on commit 3296995

Please sign in to comment.