From 32969950a7b5bf1b15fa4508a82c5d3b80b39441 Mon Sep 17 00:00:00 2001
From: Vince Buffalo <vsbuffalo@gmail.com>
Date: Sun, 25 Feb 2024 11:12:08 -0800
Subject: [PATCH] `left_overlaps()` bug fix + tests, doc.

 - New `to_tsv()` doc.
 - Fixed issue with `left_overlaps()`.
 - New `left_overlaps()` tests.
 - New example in main doc.
 - New `TsvSerialize` impls.
---
 src/granges.rs        | 67 +++++++++++++++++++++++++++++++++++++------
 src/io/mod.rs         |  4 ++-
 src/io/tsv.rs         | 60 ++++++++++++++++++++++++++++++++++++++
 src/lib.rs            | 67 ++++++++++++++++++++++++++++++++++++++++++-
 src/test_utilities.rs | 16 +++++++++++
 5 files changed, 203 insertions(+), 11 deletions(-)
diff --git a/src/granges.rs b/src/granges.rs
index e4894f1..3c4a93b 100644
--- a/src/granges.rs
+++ b/src/granges.rs
@@ -224,7 +224,13 @@ where
     T: TsvSerialize,
     <T as IndexedDataContainer>::Item<'a>: TsvSerialize,
 {
-    /// Write
+    /// Write this [`GRanges`] object to a TSV file to `output`, using the [`TsvConfig`]
+    /// specified with `config`.
+    ///
+    /// # Arguments
+    /// * `output`: either `None` (for standard out) or file path. If the filepath
+    ///             ends in `.gz`, the output will be gzip-compressed.
+    /// * `config`: a [`TsvConfig`], which contains the TSV output settings.
     fn to_tsv(
         &'a self,
         output: Option<impl Into<PathBuf>>,
@@ -247,7 +253,14 @@ where
 }
 
 impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRangesEmpty<R> {
-    /// Output a BED3 file for for this data-less [`GRanges<R, ()>`].
+    /// Write this [`GRangesEmpty`] object to a TSV file to `output`, using the [`TsvConfig`]
+    /// specified with `config`. Since this [`GRangesEmpty`] contains no data, the output
+    /// will be a BED3 file.
+    ///
+    /// # Arguments
+    /// * `output`: either `None` (for standard out) or file path. If the filepath
+    ///             ends in `.gz`, the output will be gzip-compressed.
+    /// * `config`: a [`TsvConfig`], which contains the TSV output settings.
     fn to_tsv(
         &'a self,
         output: Option<impl Into<PathBuf>>,
@@ -654,7 +667,7 @@ where
 }
 
 /// [`GRanges::left_overlaps()`] for the left with data, right empty case.
-impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRanges<VecRangesEmpty, DL>
+impl<'a, DL: 'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRanges<VecRangesIndexed, DL>
 where
     DL: IndexedDataContainer + 'a,
 {
@@ -706,9 +719,8 @@ where
 }
 
 /// [`GRanges::left_overlaps()`] for the left empty, right with data case.
-impl<'a, C, DR: 'a> LeftOverlaps<'a, GRanges<COITreesIndexed, DR>> for GRangesEmpty<C>
+impl<'a, DR: 'a> LeftOverlaps<'a, GRanges<COITreesIndexed, DR>> for GRangesEmpty<VecRangesEmpty>
 where
-    C: IterableRangeContainer,
     DR: IndexedDataContainer + 'a,
 {
     type Output = GRanges<VecRanges<RangeIndexed>, JoinDataLeftEmpty<'a, DR>>;
@@ -759,7 +771,10 @@ where
 }
 
 /// [`GRanges::left_overlaps()`] for the left empty, right empty case.
-impl<'a> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRangesEmpty<COITreesEmpty> {
+impl<'a, C> LeftOverlaps<'a, GRangesEmpty<COITreesEmpty>> for GRangesEmpty<C>
+where
+    C: IterableRangeContainer,
+{
     type Output = GRanges<VecRanges<RangeIndexed>, JoinDataBothEmpty>;
 
     /// Conduct a left overlap join, consuming self and returning a new
@@ -1446,7 +1461,25 @@ mod tests {
     }
 
     #[test]
-    fn test_apply_over_joins() {
+    fn test_left_with_data_right_empty() {
+        let sl = seqlens!("chr1" => 50);
+        let mut left_gr: GRanges<VecRangesIndexed, Vec<f64>> = GRanges::new_vec(&sl);
+        left_gr.push_range("chr1", 1, 2, 1.1).unwrap();
+        left_gr.push_range("chr1", 5, 7, 2.8).unwrap();
+
+        let windows = GRangesEmpty::from_windows(&sl, 10, None, true)
+            .unwrap()
+            .into_coitrees()
+            .unwrap();
+
+        let joined_results = left_gr.left_overlaps(&windows).unwrap();
+
+        // check the joined results
+        assert_eq!(joined_results.len(), 2)
+    }
+
+    #[test]
+    fn test_map_over_joins() {
         let sl = seqlens!("chr1" => 50);
         let windows: GRangesEmpty<VecRangesEmpty> =
             GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
@@ -1460,8 +1493,24 @@ mod tests {
         let right_gr = right_gr.into_coitrees().unwrap();
 
         // TODO
-        let joined_results = windows.left_overlaps(&right_gr).unwrap();
-        // joined_results.apply_over_joins();
+        let joined_results = windows
+            .left_overlaps(&right_gr)
+            .unwrap()
+            .map_over_joins(|join_data| {
+                let overlap_scores = join_data.right_data;
+                overlap_scores.iter().sum::<f64>()
+            })
+            .unwrap();
+
+        let mut iter = joined_results.iter_ranges();
+        let first_range = iter.next().unwrap();
+        assert_eq!(first_range.start, 0);
+        assert_eq!(first_range.end, 10);
+
+        let mut data_iter = joined_results.data.as_ref().unwrap().iter();
+        assert_eq!(*data_iter.next().unwrap(), 5.0);
+        assert_eq!(*data_iter.next().unwrap(), 0.0);
+        assert_eq!(*data_iter.next().unwrap(), 4.1);
     }
 
     #[test]
diff --git a/src/io/mod.rs b/src/io/mod.rs
index 7362132..93c2234 100644
--- a/src/io/mod.rs
+++ b/src/io/mod.rs
@@ -6,5 +6,7 @@ pub mod tsv;
 
 pub use file::{InputStream, OutputStream};
 pub use parsers::{
-    Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator,
+    Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
+    TsvRecordIterator,
 };
+pub use tsv::BED_TSV;
diff --git a/src/io/tsv.rs b/src/io/tsv.rs
index dc0429f..efac925 100644
--- a/src/io/tsv.rs
+++ b/src/io/tsv.rs
@@ -56,6 +56,66 @@ impl TsvSerialize for &Vec<DatumType> {
     }
 }
 
+impl TsvSerialize for &f64 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        // TODO precision from config
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for f64 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        // TODO precision from config
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for &f32 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        // TODO precision from config
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for f32 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        // TODO precision from config
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for &i64 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for i64 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for &i32 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        format!("{}", self).to_string()
+    }
+}
+
+impl TsvSerialize for i32 {
+    #![allow(unused_variables)]
+    fn to_tsv(&self, config: &TsvConfig) -> String {
+        format!("{}", self).to_string()
+    }
+}
+
 impl TsvSerialize for DatumType {
     fn to_tsv(&self, config: &TsvConfig) -> String {
         match self {
diff --git a/src/lib.rs b/src/lib.rs
index c2b6324..c4b41c4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,6 +18,68 @@
 //! analagous runtime dynamic data structures that would allow the library to be wrapped and used
 //! by other languages.
 //!
+//! ## Example
+//!
+//! The best introduction to the GRanges library is an example illustrating
+//! how a common genomic data processing task can be expressed in few lines
+//! of (relatively simple) code. Below is an annotated implementation of
+//! `granges map`, which provides the same functionality of `bedtools map`:
+//! arbitrary operations (e.g. mean, median, max, etc) are applied to the numeric
+//! scores of all right ranges that overlap a each left range:
+//!
+//! ```
+//! use granges::prelude::*;
+//!
+//! // Data for example:
+//! let genome = seqlens!("chr1" => 100, "chr2" => 100);
+//! let left_path = "tests_data/bedtools/map_a.txt";
+//! let right_path = "tests_data/bedtools/map_b.txt";
+//!
+//! // Read in the "genome file" of chromosomes and their lengths.
+//! let seqnames: Vec<String> = genome.keys().cloned().collect();
+//!
+//! // Create parsing iterators to the left and right BED files.
+//! let left_iter = Bed3Iterator::new(left_path)
+//!                   .expect("error inferring filetype");
+//! let right_iter = Bed5Iterator::new(right_path)
+//!                   .expect("error inferring filetype");
+//!
+//! // Filter out any ranges from chromosomes not in our genome file.
+//! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome)
+//!                  .expect("error parsing file");
+//! let right_gr = GRanges::from_iter(right_iter.retain_seqnames(&seqnames), &genome)
+//!                  .expect("error parsing file");
+//!
+//!
+//! // Create the "right" GRanges object, convert the ranges to an
+//! // interval trees, and tidy it by selecting out a f64 score.
+//! let right_gr = {
+//!     right_gr
+//!         // Convert to interval trees.
+//!         .into_coitrees().expect("error computing interval trees")
+//!         // Extract out just the score from the additional BED5 columns.
+//!         .map_data(|bed5_cols| {
+//!             bed5_cols.score
+//!         }).expect("error selecting score")
+//! };
+//!
+//! // Find the overlaps by doing a *left grouped join*.
+//! let left_join_gr = left_gr.left_overlaps(&right_gr)
+//!                        .expect("error in computing overlaps");
+//!
+//! // Process all the overlaps.
+//! let result_gr = left_join_gr.map_over_joins(|join_data| {
+//!     // Get the "right data" -- the BED5 scores.
+//!     let overlap_scores = join_data.right_data;
+//!     let score_sum: f64 = overlap_scores.iter().sum();
+//!     score_sum / (overlap_scores.len() as f64)
+//! }).expect("error computing mean score");
+//!
+//! // Write to a TSV file, using the BED TSV format standards
+//! // for missing values, etc.
+//! let path = Some("map_results.bed.gz");
+//! result_gr.to_tsv(path, &BED_TSV).expect("error writing output");
+//! ```
 //!
 //! ## Design Overview
 //!
@@ -151,10 +213,13 @@ pub mod prelude {
     pub use crate::error::GRangesError;
     pub use crate::granges::{GRanges, GRangesEmpty};
     pub use crate::io::file::read_seqlens;
+    pub use crate::io::tsv::BED_TSV;
     pub use crate::io::{
-        Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator,
+        Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
+        TsvRecordIterator,
     };
 
+    pub use crate::data::DatumType;
     pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
     pub use crate::traits::{
         AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations,
diff --git a/src/test_utilities.rs b/src/test_utilities.rs
index b9b6572..23b6e6b 100644
--- a/src/test_utilities.rs
+++ b/src/test_utilities.rs
@@ -138,6 +138,22 @@ pub fn random_bed3file(length: usize) -> NamedTempFile {
     temp_bedfile
 }
 
+/// Create a random BED5 file based on the hg38 sequence lengths, and write to disk.
+///
+/// The feature names are random lowercase characters, and the scores are
+/// random floats.
+pub fn random_bed5file(length: usize) -> NamedTempFile {
+    let temp_bedfile = temp_bedfile();
+    granges_random_bed(
+        "tests_data/hg38_seqlens.tsv",
+        length,
+        Some(temp_bedfile.path()),
+        true,
+    )
+    .expect("could not generate random BED file");
+    temp_bedfile
+}
+
 /// Range test case #1
 ///
 /// Ranges: