From f91a6b635953a17605d793db84d5400becad2360 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Fri, 16 Feb 2024 18:17:28 -0800 Subject: [PATCH] Removed unnecessary generic in RangeIterable, and made it an associated type -- greatly improves design - Removed the generic in the RangeIterable trait. This should have been an associated type, which greatly simplifies the logic and code. - Removed iter_ranges_empty(), which was possible with the above change. - Removed GRangesEmptyIterator, again possible by the change above. - Renamed RangeRecord to GenomicRangeRecord since that's more accurate (has seqname) --- src/granges.rs | 31 +++++----------- src/io/parsers.rs | 30 ++++++++-------- src/iterators.rs | 80 +++++++++--------------------------------- src/ranges/coitrees.rs | 6 ++-- src/ranges/mod.rs | 53 +++++++++++++++++----------- src/ranges/vec.rs | 6 ++-- src/traits.rs | 11 +++--- 7 files changed, 89 insertions(+), 128 deletions(-) diff --git a/src/granges.rs b/src/granges.rs index c6c12d7..0ecd896 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -5,12 +5,12 @@ use indexmap::IndexMap; use crate::{ io::OutputFile, - iterators::{GRangesEmptyIterator, GRangesIterator}, + iterators::GRangesIterator, prelude::GRangesError, ranges::{ coitrees::{COITrees, COITreesIndexed}, vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, - RangeEmpty, RangeIndexed, RangeRecord, + GenomicRangeRecord, RangeEmpty, RangeIndexed, }, traits::{GenericRange, IndexedDataContainer, RangeContainer, RangesIterable, TsvSerialize}, Position, @@ -118,7 +118,7 @@ where let seqnames = self.seqnames(); for range in self.iter_ranges() { - let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); + let record = range.to_record(&seqnames, self.data.as_ref()); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) @@ -150,7 +150,7 @@ impl GRanges> { seqlens: IndexMap, ) -> Result>, GRangesError> where - I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { let mut gr = GRanges::new_vec(&seqlens); for possible_entry in iter { @@ -167,7 +167,7 @@ impl GRanges { seqlens: IndexMap, ) -> Result, GRangesError> where - I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { let mut gr = GRanges::new_vec(&seqlens); for possible_entry in iter { @@ -180,7 +180,7 @@ impl GRanges { impl GRanges where - R: RangeContainer + RangesIterable, + R: RangeContainer + RangesIterable, { // TODO: candidate for a trait pub fn to_bed3(&self, output: Option>) -> Result<(), GRangesError> { @@ -191,8 +191,8 @@ where let mut writer = output.writer()?; let seqnames = self.seqnames(); - for range in self.iter_ranges_empty() { - let record = range.to_record(&seqnames); + for range in self.iter_ranges() { + let record = range.to_record_empty::<()>(&seqnames); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) @@ -219,20 +219,7 @@ impl GRanges { impl GRanges where - R: RangesIterable, -{ - /// Create a new [`GRangesIterator`] to iterate through all - /// the ranges in this [`GRanges`] object. These ranges carry - /// no data index, unlike the method [`GRanges.iter_ranges()`] - /// available for range type for associated data containers. - pub fn iter_ranges_empty(&self) -> GRangesEmptyIterator<'_, R> { - GRangesEmptyIterator::new(&self.ranges) - } -} - -impl GRanges -where - R: RangesIterable, + R: RangesIterable, { /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object. pub fn iter_ranges(&self) -> GRangesIterator<'_, R> { diff --git a/src/io/parsers.rs b/src/io/parsers.rs index b3124da..0cb9ed2 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -8,7 +8,7 @@ use std::path::PathBuf; use crate::error::GRangesError; use crate::io::file::InputFile; -use crate::ranges::RangeRecord; +use crate::ranges::GenomicRangeRecord; use crate::traits::GeneralRangeRecordIterator; use crate::Position; @@ -33,12 +33,12 @@ impl Bed3RecordIterator { } impl Iterator for Bed3RecordIterator { - type Item = Result, GRangesError>; + type Item = Result, GRangesError>; fn next(&mut self) -> Option { self.bed_reader.records::<3>().next().map(|res| { res.map_err(GRangesError::IOError) - .map(|record| RangeRecord { + .map(|record| GenomicRangeRecord { seqname: record.reference_sequence_name().to_string(), start: convert_noodles_position(record.start_position()), end: convert_noodles_position(record.end_position()), @@ -70,7 +70,7 @@ pub struct TsvRecordIterator { impl TsvRecordIterator where - F: Fn(&str) -> Result, GRangesError>, + F: Fn(&str) -> Result, GRangesError>, { /// Create a new [`TsvRecordIterator`], which parses lines from the supplied /// file path into [`RangeRecord`] using the specified parsing function. @@ -91,9 +91,9 @@ where impl Iterator for TsvRecordIterator where - F: Fn(&str) -> Result, GRangesError>, + F: Fn(&str) -> Result, GRangesError>, { - type Item = Result, GRangesError>; + type Item = Result, GRangesError>; fn next(&mut self) -> Option { let mut line = String::new(); @@ -112,12 +112,12 @@ where /// columns, which are standard to all BED files. #[allow(clippy::type_complexity)] pub struct BedlikeIterator { - iter: TsvRecordIterator Result, GRangesError>, String>, + iter: TsvRecordIterator Result, GRangesError>, String>, } impl BedlikeIterator { pub fn new(filepath: impl Into) -> Result { - let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; + let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; let iter = TsvRecordIterator::new(filepath, parser)?; Ok(Self { iter }) @@ -128,7 +128,7 @@ impl BedlikeIterator { } impl Iterator for BedlikeIterator { - type Item = Result, GRangesError>; + type Item = Result, GRangesError>; fn next(&mut self) -> Option { self.iter.next() @@ -164,7 +164,7 @@ impl Iterator for BedlikeIterator { /// ``` pub struct FilteredIntervals where - I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { inner: I, retain_seqnames: Option>, @@ -173,7 +173,7 @@ where impl FilteredIntervals where - I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { pub fn new( inner: I, @@ -192,9 +192,9 @@ where impl Iterator for FilteredIntervals where - I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { - type Item = Result, GRangesError>; + type Item = Result, GRangesError>; /// Get the next filtered entry, prioritizing exclude over retain. fn next(&mut self) -> Option { @@ -282,7 +282,7 @@ pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str /// Lazily parses a BED* format line into the first three columns defining the range, /// storing the rest as a `String`. -pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { +pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { let columns: Vec<&str> = line.splitn(4, '\t').collect(); if columns.len() < 3 { return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); @@ -298,7 +298,7 @@ pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { String::new() }; - Ok(RangeRecord { + Ok(GenomicRangeRecord { seqname, start, end, diff --git a/src/iterators.rs b/src/iterators.rs index 5b8a13f..b238d46 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,8 +1,8 @@ use genomap::GenomeMap; use crate::{ - ranges::{RangeEmpty, RangeEmptyRecord, RangeIndexed, RangeIndexedRecord}, - traits::{RangeContainer, RangesIterable}, + ranges::RangeIndexedRecord, + traits::{GenericRange, RangeContainer, RangesIterable}, }; /// An iterator yielding [`RangeIndexedRecord`], which store @@ -11,15 +11,18 @@ use crate::{ /// # Developer Notes /// Using indices, rather than references to data items and `&str` directly, /// prevents lifetime complexity. -pub struct GRangesIterator<'a, R> { +pub struct GRangesIterator<'a, R> +where + R: RangesIterable, +{ ranges: &'a GenomeMap, current_seqname_index: usize, - current_range_iter: Box + 'a>, + current_range_iter: Box::RangeType> + 'a>, } impl<'a, R> GRangesIterator<'a, R> where - R: RangesIterable, + R: RangesIterable, { pub fn new(ranges: &'a GenomeMap) -> Self { let current_range_iter = ranges.get_by_index(0).unwrap().iter_ranges(); @@ -33,7 +36,8 @@ where impl<'a, R> Iterator for GRangesIterator<'a, R> where - R: RangeContainer + RangesIterable, + R: RangeContainer + RangesIterable, + ::RangeType: GenericRange, { type Item = RangeIndexedRecord; @@ -42,62 +46,9 @@ where if let Some(next_range) = self.current_range_iter.next() { return Some(RangeIndexedRecord { seqname_index: self.current_seqname_index, - start: next_range.start, - end: next_range.end, - index: next_range.index, - }); - } else { - // try to load another sequence's set of ranges. - self.current_seqname_index += 1; - if self.current_seqname_index >= self.ranges.len() { - // we're out of range container iterators - return None; - } - self.current_range_iter = self - .ranges - .get_by_index(self.current_seqname_index) - .unwrap() - .iter_ranges(); - } - } - } -} - -/// An iterator over [`RangeEmptyRecord`], which store -/// indices to the sequence names (but carries no data index). -pub struct GRangesEmptyIterator<'a, R> { - ranges: &'a GenomeMap, - current_seqname_index: usize, - current_range_iter: Box + 'a>, -} - -impl<'a, R> GRangesEmptyIterator<'a, R> -where - R: RangesIterable, -{ - pub fn new(ranges: &'a GenomeMap) -> Self { - let current_range_iter = ranges.get_by_index(0).unwrap().iter_ranges(); - Self { - ranges, - current_seqname_index: 0, - current_range_iter, - } - } -} - -impl<'a, R> Iterator for GRangesEmptyIterator<'a, R> -where - R: RangeContainer + RangesIterable, -{ - type Item = RangeEmptyRecord; - - fn next(&mut self) -> Option { - loop { - if let Some(next_range) = self.current_range_iter.next() { - return Some(RangeEmptyRecord { - seqname_index: self.current_seqname_index, - start: next_range.start, - end: next_range.end, + start: next_range.start(), + end: next_range.end(), + index: next_range.index(), }); } else { // try to load another sequence's set of ranges. @@ -126,6 +77,9 @@ mod tests { fn test_genomic_ranges_iterator() { let gr = granges_test_case_01(); let mut iter = GRangesIterator::new(&gr.ranges); - assert_eq!(iter.next().unwrap(), RangeIndexedRecord::new(0, 0, 5, 0)); + assert_eq!( + iter.next().unwrap(), + RangeIndexedRecord::new(0, 0, 5, Some(0)) + ); } } diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 78f2cf4..89eef1f 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -136,7 +136,8 @@ impl From> for RangeIndexed { /// Their iterator does not consume, but we need an owned (or copyable, /// as is case here) type to map out. Thus, we need this odd variant of /// an iterator that doesn't return references and does not consume. -impl RangesIterable for COITrees { +impl RangesIterable for COITrees { + type RangeType = RangeIndexed; fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); let converted_iter = iter.map(RangeIndexed::from); @@ -144,7 +145,8 @@ impl RangesIterable for COITrees { } } -impl RangesIterable for COITrees<()> { +impl RangesIterable for COITrees<()> { + type RangeType = RangeEmpty; fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); let converted_iter = iter.map(RangeEmpty::from); diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 9a7f5c2..e960996 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -83,16 +83,19 @@ impl GenericRange for RangeIndexed { } } -/// Represents a parsed range entry, possibly containing some data. +/// Represents a genomic range entry with some data. +/// +/// This is used as a type for holding a range with associated data directly +/// from a parser. #[derive(Debug, Clone, PartialEq)] -pub struct RangeRecord { +pub struct GenomicRangeRecord { pub seqname: String, pub start: Position, pub end: Position, pub data: U, } -impl RangeRecord { +impl GenomicRangeRecord { pub fn new(seqname: String, start: Position, end: Position, data: U) -> Self { Self { seqname, @@ -103,7 +106,7 @@ impl RangeRecord { } } -impl GenericRange for RangeRecord { +impl GenericRange for GenomicRangeRecord { fn start(&self) -> Position { self.start } @@ -121,13 +124,13 @@ impl GenericRange for RangeRecord { } } -impl TsvSerialize for RangeRecord<()> { +impl TsvSerialize for GenomicRangeRecord<()> { fn to_tsv(&self) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) } } -impl TsvSerialize for RangeRecord> { +impl TsvSerialize for GenomicRangeRecord> { fn to_tsv(&self) -> String { match &self.data { None => { @@ -146,7 +149,7 @@ impl TsvSerialize for RangeRecord> { } } -impl TsvSerialize for RangeRecord { +impl TsvSerialize for GenomicRangeRecord { fn to_tsv(&self) -> String { format!( "{}\t{}\t{}\t{}", @@ -158,15 +161,15 @@ impl TsvSerialize for RangeRecord { } } -/// Represents a range entry without data. +/// Represents a genomic range entry without data. #[derive(Debug, Clone, PartialEq)] -pub struct RangeEmptyRecord { +pub struct GenomicRangeEmptyRecord { pub seqname_index: usize, pub start: Position, pub end: Position, } -impl RangeEmptyRecord { +impl GenomicRangeEmptyRecord { pub fn new(seqname_index: usize, start: Position, end: Position) -> Self { Self { seqname_index, @@ -174,8 +177,8 @@ impl RangeEmptyRecord { end, } } - pub fn to_record(self, seqnames: &[String]) -> RangeRecord<()> { - RangeRecord { + pub fn to_record(self, seqnames: &[String]) -> GenomicRangeRecord<()> { + GenomicRangeRecord { seqname: seqnames[self.seqname_index].clone(), start: self.start, end: self.end, @@ -184,7 +187,7 @@ impl RangeEmptyRecord { } } -impl GenericRange for RangeEmptyRecord { +impl GenericRange for GenomicRangeEmptyRecord { fn start(&self) -> Position { self.start } @@ -208,11 +211,11 @@ pub struct RangeIndexedRecord { pub seqname_index: usize, pub start: Position, pub end: Position, - pub index: usize, + pub index: Option, } impl RangeIndexedRecord { - pub fn new(seqname_index: usize, start: Position, end: Position, index: usize) -> Self { + pub fn new(seqname_index: usize, start: Position, end: Position, index: Option) -> Self { Self { seqname_index, start, @@ -223,16 +226,26 @@ impl RangeIndexedRecord { pub fn to_record<'a, T>( self, seqnames: &[String], - data: &'a T, - ) -> RangeRecord<>::Item> + data: Option<&'a T>, + ) -> GenomicRangeRecord>::Item>> where T: IndexedDataContainer<'a> + TsvSerialize, { - RangeRecord { + let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); + + GenomicRangeRecord { seqname: seqnames[self.seqname_index].clone(), start: self.start, end: self.end, - data: data.get_value(self.index), + data, + } + } + pub fn to_record_empty(self, seqnames: &[String]) -> GenomicRangeRecord<()> { + GenomicRangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data: (), } } } @@ -245,7 +258,7 @@ impl GenericRange for RangeIndexedRecord { self.end } fn index(&self) -> Option { - Some(self.index) + self.index } fn set_start(&mut self, start: Position) { self.start = start diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index e40c698..c2e5e05 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -66,7 +66,8 @@ impl RangesIntoIterable for VecRanges { } } -impl RangesIterable for VecRanges { +impl RangesIterable for VecRanges { + type RangeType = RangeIndexed; fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); // NOTE: RangeIndexed is copyable but there still is overhead here @@ -82,7 +83,8 @@ impl RangesIntoIterable for VecRanges { } } -impl RangesIterable for VecRanges { +impl RangesIterable for VecRanges { + type RangeType = RangeEmpty; fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); // NOTE: RangeIndexed is copyable but there still is overhead here diff --git a/src/traits.rs b/src/traits.rs index 0b31aed..4c9688c 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,7 +1,9 @@ //! Traits used by the GRanges library. //! -use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord, Position}; +use crate::{ + error::GRangesError, io::parsers::FilteredIntervals, ranges::GenomicRangeRecord, Position, +}; /// pub trait GenericRange: Clone { @@ -26,7 +28,7 @@ pub trait RangeContainer { /// so these trait methods simplify excluding or retaining ranges based on what /// sequence (i.e. chromosome) they are on. pub trait GeneralRangeRecordIterator: - Iterator, GRangesError>> + Sized + Iterator, GRangesError>> + Sized { fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals; fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals; @@ -34,8 +36,9 @@ pub trait GeneralRangeRecordIterator: /// The [`RangesIterable`] trait defines common functionality for iterating over /// the range types in range containers. -pub trait RangesIterable { - fn iter_ranges(&self) -> Box + '_>; +pub trait RangesIterable { + type RangeType: GenericRange; + fn iter_ranges(&self) -> Box + '_>; } /// The [`RangesIntoIterable`] trait defines common functionality for *consuming* iterating