diff --git a/src/commands.rs b/src/commands.rs index 8a656a7..64eae73 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,12 +1,12 @@ use std::path::PathBuf; use crate::{ - io::OutputFile, + io::{OutputFile, parsers::GenomicRangesParsers}, prelude::*, ranges::operations::adjust_range, reporting::{CommandOutput, Report}, test_utilities::random_granges, - traits::{IterableRangeContainer, TsvSerialize}, + traits::TsvSerialize, PositionOffset, }; @@ -22,23 +22,23 @@ pub fn granges_adjust( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; - // create the parsing iterator, and detect which variant we need based on - // column number of the first entry. - let bedlike_iterator = BedlikeIterator::new(bedfile)?; - - // output stream -- header is None for now (TODO) + // Setup Output stream -- header is None for now (TODO). let output_stream = output.map_or(OutputFile::new_stdout(None), |file| { OutputFile::new(file, None) }); let mut writer = output_stream.writer()?; - // for reporting stuff to the user + // For reporting stuff to the user. let mut report = Report::new(); let mut skipped_ranges = 0; if !sort { - // if we don't need to sort, use iterator-based streaming processing + // Create the parsing iterator, and detect which variant we need based on + // column number of the first entry. + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + // If we don't need to sort, use iterator-based streaming processing. for record in bedlike_iterator { let range = record?; let seqname = &range.seqname; @@ -56,17 +56,36 @@ pub fn granges_adjust( if skipped_ranges > 0 { report.add_issue(format!( - "{} ranges were removed because their widths after adjustment were ≤ 0", - skipped_ranges - )) + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) } } } else { - // if we do need to sort, build up a GRanges variant and adjust ranges that way - // let mut gr = GRanges::from_iter(bedlike_iterator, &genome)?; - // gr.adjust_ranges(-both, both).to_tsv(output)? - } + // If we do need to sort, build up a GRanges variant and adjust ranges through + // the GRanges interface. Note we need to detect and build a specific iterator + // for the filetype. + + let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; + match ranges_iter { + GenomicRangesParsers::Bed3(iter) => { + let gr = GRangesEmpty::from_iter(iter, &genome)?; + gr.adjust_ranges(-both, both).to_tsv(output)? + }, + GenomicRangesParsers::Bedlike(iter) => { + // Note the call to try_unwrap_data() here: this is because + // we know that the records *do* have data. Unwrapping the Option + // values means that writing to TSV doesn't have to deal with this (which + // always creates headaches). + let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?; + gr.adjust_ranges(-both, both).to_tsv(output)? + }, + GenomicRangesParsers::Unsupported => { + return Err(GRangesError::UnsupportedGenomicRangesFileFormat) + }, + } + } Ok(CommandOutput::new((), report)) } // @@ -107,7 +126,7 @@ pub fn granges_random_bed( num: u32, output: Option>, sort: bool, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/granges.rs b/src/granges.rs index 340d2f8..61da58d 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -62,7 +62,7 @@ use genomap::GenomeMap; use indexmap::IndexMap; use crate::{ - io::{parsers::GenomicRangesIteratorVariant, OutputFile}, + io::{OutputFile}, iterators::GRangesIterator, prelude::GRangesError, ranges::{ @@ -72,7 +72,7 @@ use crate::{ }, traits::{ GenericRange, GenomicRangesTsvSerialize, - IndexedDataContainer, RangeContainer, IterableRangeContainer, TsvSerialize, IntoGRangesRef, + IndexedDataContainer, RangeContainer, IterableRangeContainer, TsvSerialize, AsGRangesRef, AdjustableGenericRange, }, Position, PositionOffset, }; @@ -121,28 +121,28 @@ where } } -impl Into> for GRangesEmpty { - fn into(self) -> GRanges { - self.0 +impl From> for GRanges { + fn from(value: GRangesEmpty) -> Self { + value.0 } } -impl<'a, C> IntoGRangesRef<'a, C, ()> for GRangesEmpty { +impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { /// Convert a reference to a [`GRangesEmpty`] to a reference to the /// underlying [`GRanges`]. This is to greatly improve the ergonomics /// of functions that could take either a [`GRanges`] or [`GRangesEmpty] type. - fn into_granges_ref(&'a self) -> &'a GRanges { + fn as_granges_ref(&'a self) -> &'a GRanges { &self.0 } } -impl<'a, C, T> IntoGRangesRef<'a, C, T> for GRanges { +impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { /// Return a reference of a [`GRanges`] object. This is essentially /// a pass-through method. [`IntoGRangesRef`] is not needed in this case, /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to /// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types. - fn into_granges_ref(&'a self) -> &'a GRanges { - &self + fn as_granges_ref(&'a self) -> &'a GRanges { + self } } @@ -251,7 +251,14 @@ impl GRanges, T> { self.ranges.values_mut().for_each(|ranges| ranges.sort()); self } + + pub fn shink(&mut self) { + todo!() + } +} + +impl GRanges, T> { /// Adjust all the ranges in this [`GRanges`] object in place. pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { self.ranges @@ -259,12 +266,8 @@ impl GRanges, T> { .for_each(|ranges| ranges.adjust_ranges(start_delta, end_delta)); self } - - pub fn shink(&mut self) { - todo!() - } } - + impl GRangesEmpty> { /// Create a new [`GRangesEmpty`] object, with vector storage for ranges and no @@ -273,19 +276,21 @@ impl GRangesEmpty> { GRangesEmpty(GRanges::new_vec(seqlens)) } - pub fn sort(mut self) -> Self { + pub fn sort(self) -> Self { GRangesEmpty(self.0.sort()) } - pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { - GRangesEmpty(self.0.adjust_ranges(start_delta, end_delta)) - } - pub fn shink(&mut self) { todo!() } } +impl GRangesEmpty> { + pub fn adjust_ranges(self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { + GRangesEmpty(self.0.adjust_ranges(start_delta, end_delta)) + } +} + impl GRanges> { /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. pub fn push_range( @@ -395,14 +400,14 @@ impl GRanges, T> { impl GRanges> { /// Create a new [`GRanges>`] object from an iterator over /// [`GenomicRangeRecord`] records. - pub fn from_iter_with_data( + pub fn from_iter( iter: I, seqlens: &IndexMap, ) -> Result>, GRangesError> where I: Iterator, GRangesError>>, { - let mut gr = GRanges::new_vec(&seqlens); + let mut gr = GRanges::new_vec(seqlens); for possible_entry in iter { let entry = possible_entry?; gr.push_range(&entry.seqname, entry.start, entry.end, entry.data)?; @@ -421,7 +426,7 @@ impl GRangesEmpty { where I: Iterator>, { - let mut gr = GRangesEmpty::new_vec(&seqlens); + let mut gr = GRangesEmpty::new_vec(seqlens); for possible_entry in iter { let entry = possible_entry?; gr.push_range(&entry.seqname, entry.start, entry.end)?; @@ -484,15 +489,15 @@ where pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( self, // right: &GRanges, DR>, - right: &'a impl IntoGRangesRef<'a, COITrees, DR> + right: &'a impl AsGRangesRef<'a, COITrees, DR> ) -> Result, GRangesError> { let mut gr = GRangesEmpty::new_vec(&self.seqlens()); - let right_ref = right.into_granges_ref(); + let right_ref = right.as_granges_ref(); for (seqname, left_ranges) in self.0.ranges.iter() { for left_range in left_ranges.iter_ranges() { - if let Some(right_ranges) = right_ref.ranges.get(&seqname) { + if let Some(right_ranges) = right_ref.ranges.get(seqname) { let num_overlaps = right_ranges.count_overlaps(left_range.start(), left_range.end()); if num_overlaps == 0 { @@ -507,7 +512,7 @@ where } } -impl<'a, CL, U> GRanges> +impl GRanges> where CL: IterableRangeContainer, { @@ -540,7 +545,7 @@ where for (seqname, left_ranges) in self.ranges.iter() { for left_range in left_ranges.iter_ranges() { - if let Some(right_ranges) = right.ranges.get(&seqname) { + if let Some(right_ranges) = right.ranges.get(seqname) { let num_overlaps = right_ranges.count_overlaps(left_range.start(), left_range.end()); if num_overlaps == 0 { @@ -558,13 +563,6 @@ where } Ok(gr) } - - pub fn filter_overlaps_anti>( - &self, - right: &GRanges, - ) -> Result>, GRangesError> { - todo!() - } } diff --git a/src/io/file.rs b/src/io/file.rs index d73fff7..021e4bb 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -48,7 +48,7 @@ fn is_gzipped_file(file_path: impl Into) -> io::Result { /// This struct is used to handle operations on an input file, such as reading from the file. /// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input /// to be read through a common interface. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug)] pub struct InputFile { pub filepath: PathBuf, pub comments: Option>, diff --git a/src/io/mod.rs b/src/io/mod.rs index 9d73414..e6cbc57 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,4 +6,4 @@ pub mod noodles; pub mod parsers; pub use file::{InputFile, OutputFile}; -pub use parsers::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, TsvRecordIterator}; +pub use parsers::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParsers, TsvRecordIterator}; diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 11b288a..a0a97b0 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -80,14 +80,6 @@ use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; use crate::traits::GeneralRangeRecordIterator; use crate::Position; -/// Enum that indicates the filetype of some genomic ranges file. -#[derive(Debug, PartialEq)] -pub enum GenomicRangesFile { - Bed3(PathBuf), - Bedlike(PathBuf), - Unsupported, -} - /// Get the *base* extension to help infer filetype, which ignores compression-related /// extensions (`.gz` and `.bgz`). fn get_base_extension>(filepath: P) -> Option { @@ -101,7 +93,7 @@ fn get_base_extension>(filepath: P) -> Option { .split('.') .collect(); - let ignore_extensions = vec!["gz", "bgz"]; + let ignore_extensions = ["gz", "bgz"]; let has_ignore_extension = parts .last() @@ -131,7 +123,7 @@ fn valid_bedlike(input_file: &mut InputFile) -> Result { reader.read_line(&mut first_line)?; let columns = first_line - .splitn(4, "\t") + .splitn(4, '\t') .map(String::from) .collect::>(); @@ -143,8 +135,6 @@ fn valid_bedlike(input_file: &mut InputFile) -> Result { // Attempt to parse the second and third columns as positions let start_result = columns[1].trim().parse::(); let end_result = columns[2].trim().parse::(); - dbg!(&columns); - dbg!((&start_result, &end_result)); // Check if both positions are valid match (start_result, end_result) { @@ -153,6 +143,22 @@ fn valid_bedlike(input_file: &mut InputFile) -> Result { } } +/// Enum that connects a genomic ranges file type to its specific parser. +#[derive(Debug)] +pub enum GenomicRangesParsers { + Bed3(Bed3Iterator), + Bedlike(BedlikeIterator), + Unsupported, +} + +/// Enum that indicates the filetype of some genomic ranges file. +#[derive(Debug, PartialEq)] +pub enum GenomicRangesFile { + Bed3(PathBuf), + Bedlike(PathBuf), + Unsupported, +} + impl GenomicRangesFile { /// Detect the type of range genomic range file type we are working with, and output /// the appropriate [`GenomicRangesFile`] enum variant. @@ -194,7 +200,6 @@ impl GenomicRangesFile { // test if the first row can be parsed into a BED-like file format. let is_valid_bedlike = valid_bedlike(&mut input_file)?; - dbg!(extension.as_str(), number_columns, is_valid_bedlike); let file_type = match (extension.as_str(), number_columns, is_valid_bedlike) { ("bed", 3, true) => GenomicRangesFile::Bed3(path), ("tsv", 3, true) => GenomicRangesFile::Bed3(path), @@ -205,9 +210,27 @@ impl GenomicRangesFile { Ok(file_type) } - + /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error + /// if the filetype is not supported. + /// + /// This returns a [`GenomicRangesParsers`] enum, since the parsing iterator filetype + /// cannot be known at compile time. + pub fn parsing_iterator(filepath: impl Clone + Into) -> Result { + let path = filepath.into(); + dbg!(&path); + match Self::detect(path)? { + GenomicRangesFile::Bed3(path) => + Ok(GenomicRangesParsers::Bed3(Bed3Iterator::new(path)?)), + GenomicRangesFile::Bedlike(path) => + Ok(GenomicRangesParsers::Bedlike(BedlikeIterator::new(path)?)), + GenomicRangesFile::Unsupported => { + Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + } + } } + /// An extensible TSV parser, which uses a supplied parser function to /// convert a line into a [`RangeRecord`], a range with generic associated /// data. @@ -218,9 +241,15 @@ pub struct TsvRecordIterator { phantom: PhantomData, } +impl std::fmt::Debug for TsvRecordIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TsvRecordIterator").finish_non_exhaustive() + } +} + impl TsvRecordIterator where - F: Fn(&str) -> Result, +F: Fn(&str) -> Result, { /// Create a new [`TsvRecordIterator`], which parses lines from the supplied /// file path into [`RangeRecord`] using the specified parsing function. @@ -241,7 +270,7 @@ where impl Iterator for TsvRecordIterator where - F: Fn(&str) -> Result, +F: Fn(&str) -> Result, { type Item = Result; @@ -269,11 +298,12 @@ pub enum GenomicRangesIteratorVariant { /// string columns to parse. /// #[allow(clippy::type_complexity)] +#[derive(Debug)] pub struct Bed3Iterator { iter: TsvRecordIterator< fn(&str) -> Result, GenomicRangeEmptyRecord, - >, + >, } impl Bed3Iterator { @@ -303,11 +333,12 @@ impl Iterator for Bed3Iterator { /// string columns to parse. /// #[allow(clippy::type_complexity)] +#[derive(Debug)] pub struct BedlikeIterator { iter: TsvRecordIterator< fn(&str) -> Result>, GRangesError>, GenomicRangeRecord>, - >, + >, } impl BedlikeIterator { @@ -342,7 +373,7 @@ impl BedlikeIterator { /// 2. During iteration, a `None` data element is encountered. pub fn try_unwrap_data( self, - ) -> Result, GRangesError>>, GRangesError> + ) -> Result, GRangesError>>, GRangesError> { if self.number_columns() < 4 { return Err(GRangesError::TooFewColumns)?; @@ -351,11 +382,11 @@ impl BedlikeIterator { result.and_then(|record| { if let Some(data) = record.data { Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )) + record.seqname, + record.start, + record.end, + data, + )) } else { Err(GRangesError::TryUnwrapDataError) } @@ -370,13 +401,13 @@ impl BedlikeIterator { result .map(|record| { Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - (), - )) + record.seqname, + record.start, + record.end, + (), + )) }) - .unwrap_or_else(|e| Err(e)) // pass through parsing errors + .unwrap_or_else(Err) // pass through parsing errors }) } @@ -394,8 +425,8 @@ impl BedlikeIterator { }) }); Ok(GenomicRangesIteratorVariant::Empty(Box::new( - without_data_iterator, - ))) + without_data_iterator, + ))) } else { let with_data_iterator = self.try_unwrap_data()?.map(|result| { result.map(|record| GenomicRangeRecord { @@ -406,8 +437,8 @@ impl BedlikeIterator { }) }); Ok(GenomicRangesIteratorVariant::WithData(Box::new( - with_data_iterator, - ))) + with_data_iterator, + ))) } } } @@ -450,7 +481,7 @@ impl Iterator for BedlikeIterator { /// ``` pub struct FilteredRanges where - I: Iterator>, +I: Iterator>, { inner: I, retain_seqnames: Option>, @@ -459,13 +490,13 @@ where impl FilteredRanges where - I: Iterator>, +I: Iterator>, { pub fn new( inner: I, retain_seqnames: Option>, exclude_seqnames: Option>, - ) -> Self { + ) -> Self { let retain_seqnames = retain_seqnames.map(HashSet::from_iter); let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); Self { @@ -479,7 +510,7 @@ where /// Range-filtering iterator implementation for [`GenomicRangeRecord`]. impl Iterator for FilteredRanges> where - I: Iterator, GRangesError>>, +I: Iterator, GRangesError>>, { type Item = Result, GRangesError>; @@ -490,18 +521,18 @@ where Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -513,7 +544,7 @@ where /// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. impl Iterator for FilteredRanges where - I: Iterator>, +I: Iterator>, { type Item = Result; @@ -524,18 +555,18 @@ where Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -548,13 +579,13 @@ impl GeneralRangeRecordIterator for Bed3Iterator { fn retain_seqnames( self, seqnames: Vec, - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, Some(seqnames), None) } fn exclude_seqnames( self, seqnames: Vec, - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, None, Some(seqnames)) } } @@ -578,7 +609,7 @@ impl GeneralRangeRecordIterator for Bed3Iterator { /// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. pub fn parse_column(column: &str, line: &str) -> Result where - ::Err: std::fmt::Debug, +::Err: std::fmt::Debug, { // NOTE: this is used a lot, and should be benchmarked. column @@ -694,15 +725,15 @@ mod tests { fn test_rangefiletype_detect() { let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bed3(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bed3(_) + )); let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bedlike(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bedlike(_) + )); } #[test] @@ -710,11 +741,11 @@ mod tests { assert_eq!( valid_bedlike(&mut InputFile::new("tests_data/example.bed")).unwrap(), true - ); + ); assert_eq!( valid_bedlike(&mut InputFile::new("tests_data/invalid_format.bed")).unwrap(), false - ); + ); } //#[test] diff --git a/src/lib.rs b/src/lib.rs index 8d78620..69a940e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,7 @@ pub mod prelude { pub use crate::error::GRangesError; pub use crate::granges::{GRanges, GRangesEmpty}; pub use crate::io::file::read_seqlens; - pub use crate::io::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, TsvRecordIterator}; + pub use crate::io::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, TsvRecordIterator, GenomicRangesParsers}; pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ diff --git a/src/main/mod.rs b/src/main/mod.rs index 178550d..2b5abdb 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -116,6 +116,7 @@ fn run() -> Result<(), GRangesError> { sort, }) => { let genome = read_seqlens(seqlens)?; + let mut report = Report::new(); let left_filetype = GenomicRangesFile::detect(left)?; let right_filetype = GenomicRangesFile::detect(right)?; @@ -144,18 +145,15 @@ fn run() -> Result<(), GRangesError> { Ok(CommandOutput::new((), report)) } (GenomicRangesFile::Bed3(left_file), GenomicRangesFile::Bedlike(right_file)) => { - let mut report = Report::new(); Ok(CommandOutput::new((), report)) } (GenomicRangesFile::Bedlike(left_file), GenomicRangesFile::Bed3(right_file)) => { - let mut report = Report::new(); Ok(CommandOutput::new((), report)) } (GenomicRangesFile::Bedlike(left_file), GenomicRangesFile::Bedlike(right_file)) => { - let mut report = Report::new(); Ok(CommandOutput::new((), report)) } - _ => return Err(GRangesError::UnsupportedGenomicRangesFileFormat), + _ => Ok(CommandOutput::new((), report)), } // granges_filter(left, right, output.as_ref(), *sort) } diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 942f69e..0125719 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -29,7 +29,7 @@ impl std::fmt::Debug for COITrees { f.debug_struct("COITrees") .field("number of ranges:", &self.ranges.len()) .field("length", &self.length) - .finish() + .finish_non_exhaustive() } } @@ -180,12 +180,6 @@ impl GenericRange for IntervalNode<(), usize> { fn index(&self) -> Option { None } - fn set_end(&mut self, end: Position) { - unimplemented!() - } - fn set_start(&mut self, start: Position) { - unimplemented!() - } } impl GenericRange for IntervalNode { @@ -198,12 +192,6 @@ impl GenericRange for IntervalNode { fn index(&self) -> Option { Some(*self.metadata()) } - fn set_end(&mut self, end: Position) { - unimplemented!() - } - fn set_start(&mut self, start: Position) { - unimplemented!() - } } #[cfg(test)] diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 8d720e1..281de53 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -4,7 +4,7 @@ use crate::{ error::GRangesError, - traits::{GenericRange, IndexedDataContainer, TsvSerialize}, + traits::{GenericRange, IndexedDataContainer, TsvSerialize, AdjustableGenericRange}, Position, }; @@ -39,6 +39,9 @@ impl GenericRange for RangeEmpty { fn index(&self) -> Option { None } +} + +impl AdjustableGenericRange for RangeEmpty { fn set_start(&mut self, start: Position) { self.start = start } @@ -77,6 +80,9 @@ impl GenericRange for RangeIndexed { fn index(&self) -> Option { Some(self.index) } +} + +impl AdjustableGenericRange for RangeIndexed { fn set_start(&mut self, start: Position) { self.start = start } @@ -119,6 +125,9 @@ impl GenericRange for GenomicRangeRecord { fn index(&self) -> Option { None } +} + +impl AdjustableGenericRange for GenomicRangeRecord { fn set_start(&mut self, start: Position) { self.start = start } @@ -127,6 +136,8 @@ impl GenericRange for GenomicRangeRecord { } } + + impl TsvSerialize for GenomicRangeRecord<()> { fn to_tsv(&self) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) @@ -146,23 +157,23 @@ impl TsvSerialize for GenomicRangeRecord> { self.start, self.end, data.to_tsv() - ) + ) } } } } - -impl TsvSerialize for GenomicRangeRecord { - fn to_tsv(&self) -> String { - format!( - "{}\t{}\t{}\t{}", - self.seqname, - self.start, - self.end, - self.data.to_tsv() - ) - } -} +// +// impl TsvSerialize for GenomicRangeRecord { +// fn to_tsv(&self) -> String { +// format!( +// "{}\t{}\t{}\t{}", +// self.seqname, +// self.start, +// self.end, +// self.data.to_tsv() +// ) +// } +// } /// Represents a genomic range entry without data, e.g. from a BED3 parser. #[derive(Debug, Clone, PartialEq)] @@ -193,6 +204,10 @@ impl GenericRange for GenomicRangeEmptyRecord { fn index(&self) -> Option { None } +} + + +impl AdjustableGenericRange for GenomicRangeEmptyRecord { fn set_start(&mut self, start: Position) { self.start = start } @@ -224,19 +239,19 @@ impl GenomicRangeIndexedRecord { self, seqnames: &[String], data: Option<&'a T>, - ) -> GenomicRangeRecord>::Item>> - where + ) -> GenomicRangeRecord>::Item>> + where T: IndexedDataContainer<'a> + TsvSerialize, - { - let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); + { + let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); - GenomicRangeRecord { - seqname: seqnames[self.seqname_index].clone(), - start: self.start, - end: self.end, - data, + GenomicRangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data, + } } - } pub fn to_record_empty(self, seqnames: &[String]) -> GenomicRangeRecord<()> { GenomicRangeRecord { seqname: seqnames[self.seqname_index].clone(), @@ -257,6 +272,9 @@ impl GenericRange for GenomicRangeIndexedRecord { fn index(&self) -> Option { self.index } +} + +impl AdjustableGenericRange for GenomicRangeIndexedRecord { fn set_start(&mut self, start: Position) { self.start = start } @@ -279,15 +297,15 @@ pub fn validate_range( start: Position, end: Position, length: Position, -) -> Result<(), GRangesError> { + ) -> Result<(), GRangesError> { if start > end { return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { return Err(GRangesError::InvalidGenomicRangeForSequence( - start, end, length, - )); + start, end, length, + )); } Ok(()) } @@ -301,9 +319,9 @@ mod tests { fn test_invalid_range_start_end() { let result = validate_range(5, 1, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRange(5, 1)) - )); + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); } #[test] @@ -316,9 +334,9 @@ mod tests { fn test_invalid_range_length() { let result = validate_range(1, 10, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) - )); + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } #[test] diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index 9559872..c6803a4 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,11 +2,11 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{traits::GenericRange, Position, PositionOffset}; +use crate::{traits::{AdjustableGenericRange}, Position, PositionOffset}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. -pub fn adjust_range( +pub fn adjust_range( mut range: R, start_delta: PositionOffset, end_delta: PositionOffset, diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 34019fa..9331d01 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,6 +1,6 @@ use super::operations::adjust_range; use super::{validate_range, RangeEmpty, RangeIndexed}; -use crate::traits::{GenericRange, IntoIterableRangesContainer, IterableRangeContainer}; +use crate::traits::{GenericRange, IntoIterableRangesContainer, IterableRangeContainer, AdjustableGenericRange}; use crate::PositionOffset; use crate::{error::GRangesError, traits::RangeContainer, Position}; @@ -53,10 +53,12 @@ impl VecRanges { .then_with(|| a.index().cmp(&b.index())) }); } +} +impl VecRanges { /// Adjust all the ranges in this [`VecRanges`] range container. pub fn adjust_ranges(&mut self, start_delta: PositionOffset, end_delta: PositionOffset) { - let mut ranges = std::mem::replace(&mut self.ranges, Vec::new()); + let mut ranges = std::mem::take(&mut self.ranges); ranges = ranges .into_iter() diff --git a/src/traits.rs b/src/traits.rs index 5775ab0..7f3ef7d 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -4,7 +4,7 @@ use std::path::PathBuf; use crate::{ - error::GRangesError, granges::GRanges, io::parsers::FilteredRanges, Position, ranges::{coitrees::COITrees, vec::VecRanges}, + error::GRangesError, granges::GRanges, io::parsers::FilteredRanges, Position, }; /// Traits for [`GRanges`] types that can be modified. @@ -13,8 +13,10 @@ use crate::{ //} -pub trait IntoGRangesRef<'a, C, T> { - fn into_granges_ref(&'a self) -> &'a GRanges; +/// The [`AsGRangesRef`] trait improves the ergonomics of working +/// with both [`GRanges`] and [`GRangesEmpty`] function arguments. +pub trait AsGRangesRef<'a, C, T> { + fn as_granges_ref(&'a self) -> &'a GRanges; } /// The [`GenomicRangesTsvSerialize`] trait defines how to convert a [`GRanges`] @@ -37,10 +39,8 @@ pub trait GenericRange: Clone { fn start(&self) -> Position; fn end(&self) -> Position; fn index(&self) -> Option; - fn set_start(&mut self, start: Position); - fn set_end(&mut self, end: Position); fn width(&self) -> Position { - return self.end() - self.start() - 1; + self.end() - self.start() - 1 } /// Calculate how many basepairs overlap this range and other. fn overlap_width(&self, other: &R) -> Position { @@ -62,6 +62,14 @@ pub trait GenericRange: Clone { } } +/// The [`AdjustableGenericRange`] trait extends addtion functionality to adjustable generic ranges. +pub trait AdjustableGenericRange: GenericRange { + /// Set the start to the specified position. + fn set_start(&mut self, start: Position); + /// Set the end to the specified position. + fn set_end(&mut self, end: Position); +} + /// Defines functionality common to all range containers, e.g. [`VecRanges`] and /// [`COITrees`]. pub trait RangeContainer { diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 590b4f6..f93778a 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -1,20 +1,45 @@ //! Validation against bedtools -use granges::{commands::granges_random_bed, test_utilities::granges_binary_path}; +use granges::{commands::granges_random_bed, test_utilities::granges_binary_path, prelude::{Bed3Iterator, GenomicRangesFile}}; use std::process::Command; -use tempfile::NamedTempFile; +use tempfile::{NamedTempFile, Builder}; + + +fn temp_bedfile() -> NamedTempFile { + Builder::new() + .suffix(".bed") + .tempfile() + .expect("Failed to create temp file") +} #[test] -fn test_against_bedtools_slop() { - let random_bedfile = NamedTempFile::new().expect("Failed to create temp file"); - let random_bedfile_path = random_bedfile.path().to_path_buf(); +fn test_random_bed3file_filetype_detect() { + let random_bedfile_path = temp_bedfile().path().to_path_buf(); + + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + 100_000, + Some(&random_bedfile_path), + true, + ) + .expect("could not generate random BED file"); + + match GenomicRangesFile::detect(random_bedfile_path).unwrap() { + GenomicRangesFile::Bed3(_) => (), + _ => panic!("could not detect correct filetype"), + } +} + +#[test] fn test_against_bedtools_slop() { let random_bedfile = temp_bedfile(); let + random_bedfile_path = random_bedfile.path(); + granges_random_bed( "tests_data/hg38_seqlens.tsv", 100_000, Some(&random_bedfile_path), true, - ) - .expect("could not generate random BED file"); + ) + .expect("could not generate random BED file"); let width = 10; @@ -25,7 +50,7 @@ fn test_against_bedtools_slop() { .arg("-b") .arg(width.to_string()) .arg("-i") - .arg(random_bedfile.path()) + .arg(&random_bedfile_path) .output() .expect("bedtools slop failed"); @@ -36,16 +61,15 @@ fn test_against_bedtools_slop() { .arg("--both") .arg(width.to_string()) .arg("--sort") - .arg(random_bedfile.path()) + .arg(&random_bedfile_path) .output() .expect("granges adjust failed"); - assert!(bedtools_output.status.success()); - assert!(granges_output.status.success()); + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); - // TODO - //assert_eq!( - // String::from_utf8_lossy(&bedtools_output.stdout), - // String::from_utf8_lossy(&granges_output.stdout) - //); + assert_eq!( + String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout) + ); }