diff --git a/Cargo.toml b/Cargo.toml index cd85509..d67871c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ indexmap = "2.2.3" ndarray = "0.15.6" noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" +tempfile = "3.10.0" thiserror = "1.0.57" [features] @@ -38,7 +39,6 @@ path = "src/main/mod.rs" [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } -tempfile = "3.10.0" [[bench]] name = "bedtools_comparison" diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 2f15393..9b3bd30 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -5,32 +5,17 @@ //! ensure the output is the *exact* same. use criterion::{criterion_group, criterion_main, Criterion}; -use granges::{commands::granges_random_bed, test_utilities::granges_binary_path}; +use granges::test_utilities::{granges_binary_path, random_bedfile}; use std::process::Command; -use tempfile::NamedTempFile; -const BED_LENGTH: u32 = 10_000; - -/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. -pub fn random_bedfile() -> NamedTempFile { - let temp_file = NamedTempFile::new().expect("Failed to create temp file"); - let random_bedfile_path = temp_file.path().to_path_buf(); - granges_random_bed( - "tests_data/hg38_seqlens.tsv", - BED_LENGTH, - Some(&random_bedfile_path), - true, - ) - .expect("could not generate random BED file"); - temp_file -} +const BED_LENGTH: usize = 10_000; fn bench_range_adjustment(c: &mut Criterion) { // create the benchmark group let mut group = c.benchmark_group("slop vs adjust"); // create the test data - let input_bedfile = random_bedfile(); + let input_bedfile = random_bedfile(BED_LENGTH); // configure the sample size for the group // group.sample_size(10); @@ -68,5 +53,51 @@ fn bench_range_adjustment(c: &mut Criterion) { }); }); } -criterion_group!(benches, bench_range_adjustment); + +fn bench_filter_adjustment(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("intersect vs filter"); + + // create the test data + let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH); + let random_bedfile_right_tempfile = random_bedfile(BED_LENGTH); + let random_bedfile_left = random_bedfile_left_tempfile.path(); + let random_bedfile_right = random_bedfile_right_tempfile.path(); + + // configure the sample size for the group + // group.sample_size(10); + group.bench_function("bedtools_intersect", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("intersect") + .arg("-a") + .arg(&random_bedfile_left) + .arg("-b") + .arg(&random_bedfile_right) + .arg("-wa") + .arg("-u") + .output() + .expect("bedtools intersect failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_filter", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("filter") + .arg("--seqlens") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(&random_bedfile_left) + .arg("--right") + .arg(&random_bedfile_right) + .output() + .expect("granges adjust failed"); + assert!(granges_output.status.success()); + }); + }); +} + +criterion_group!(benches, bench_filter_adjustment, bench_range_adjustment); criterion_main!(benches); diff --git a/src/commands.rs b/src/commands.rs index 32a1bb9..5ab3948 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use crate::{ - io::{OutputFile, parsers::GenomicRangesParser}, + io::{parsers::GenomicRangesParser, OutputFile}, prelude::*, ranges::operations::adjust_range, reporting::{CommandOutput, Report}, @@ -56,14 +56,14 @@ pub fn granges_adjust( if skipped_ranges > 0 { report.add_issue(format!( - "{} ranges were removed because their widths after adjustment were ≤ 0", - skipped_ranges - )) + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) } } } else { - // If we do need to sort, build up a GRanges variant and adjust ranges through - // the GRanges interface. Note we need to detect and build a specific iterator + // If we do need to sort, build up a GRanges variant and adjust ranges through + // the GRanges interface. Note we need to detect and build a specific iterator // for the filetype. let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; @@ -71,19 +71,18 @@ pub fn granges_adjust( GenomicRangesParser::Bed3(iter) => { let gr = GRangesEmpty::from_iter(iter, &genome)?; gr.adjust_ranges(-both, both).to_tsv(output)? - }, + } GenomicRangesParser::Bedlike(iter) => { // Note the call to try_unwrap_data() here: this is because // we know that the records *do* have data. Unwrapping the Option - // values means that writing to TSV doesn't have to deal with this (which + // values means that writing to TSV doesn't have to deal with this (which // always creates headaches). let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?; gr.adjust_ranges(-both, both).to_tsv(output)? - }, + } GenomicRangesParser::Unsupported => { return Err(GRangesError::UnsupportedGenomicRangesFileFormat) - }, - + } } } Ok(CommandOutput::new((), report)) @@ -102,47 +101,65 @@ pub fn granges_filter( let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; - let output_stream = output.as_ref().map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) - }); - let mut writer = output_stream.writer()?; - // for reporting stuff to the user let mut report = Report::new(); match (left_iter, right_iter) { (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => { + let left_gr = GRangesEmpty::from_iter(left, &genome)?; + let right_gr = GRangesEmpty::from_iter(right, &genome)?; - let left_gr = GRangesEmpty::from_iter(left, &genome)?; let right_gr = - GRangesEmpty::from_iter(right, &genome)?; - - let right_gr = right_gr.to_coitrees()?; + let right_gr = right_gr.into_coitrees()?; - let intersection = left_gr.filter_overlaps(&right_gr)?; + let mut intersection = left_gr.filter_overlaps(&right_gr)?; intersection.to_tsv(output)?; Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { + let left_gr = GRangesEmpty::from_iter(left, &genome)?; + let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?; + + let right_gr = right_gr.into_coitrees()?; + + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; + Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { + let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?; + let right_gr = GRangesEmpty::from_iter(right, &genome)?; + + let right_gr = right_gr.into_coitrees()?; + + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; + Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { + let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?; + let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?; + + let right_gr = right_gr.into_coitrees()?; + + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; + Ok(CommandOutput::new((), report)) } - _ => Ok(CommandOutput::new((), report)), + _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat), } } /// Generate a random BED-like file with genomic ranges. pub fn granges_random_bed( seqlens: impl Into, - num: u32, + num: usize, output: Option>, sort: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/granges.rs b/src/granges.rs index 61da58d..66a017f 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -62,7 +62,7 @@ use genomap::GenomeMap; use indexmap::IndexMap; use crate::{ - io::{OutputFile}, + io::OutputFile, iterators::GRangesIterator, prelude::GRangesError, ranges::{ @@ -71,8 +71,8 @@ use crate::{ GenomicRangeEmptyRecord, GenomicRangeRecord, RangeEmpty, RangeIndexed, }, traits::{ - GenericRange, GenomicRangesTsvSerialize, - IndexedDataContainer, RangeContainer, IterableRangeContainer, TsvSerialize, AsGRangesRef, AdjustableGenericRange, + AdjustableGenericRange, AsGRangesRef, GenericRange, GenomicRangesTsvSerialize, + IndexedDataContainer, IterableRangeContainer, RangeContainer, TsvSerialize, }, Position, PositionOffset, }; @@ -112,7 +112,8 @@ where /// Get the sequences lengths. pub fn seqlens(&self) -> IndexMap { - let seqlens = self.0 + let seqlens = self + .0 .ranges .iter() .map(|(seqname, ranges)| (seqname.to_string(), ranges.sequence_length())) @@ -128,7 +129,7 @@ impl From> for GRanges { } impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { - /// Convert a reference to a [`GRangesEmpty`] to a reference to the + /// Convert a reference to a [`GRangesEmpty`] to a reference to the /// underlying [`GRanges`]. This is to greatly improve the ergonomics /// of functions that could take either a [`GRanges`] or [`GRangesEmpty] type. fn as_granges_ref(&'a self) -> &'a GRanges { @@ -139,15 +140,13 @@ impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { /// Return a reference of a [`GRanges`] object. This is essentially /// a pass-through method. [`IntoGRangesRef`] is not needed in this case, - /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to + /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to /// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types. fn as_granges_ref(&'a self) -> &'a GRanges { self } } - - impl GRanges where C: RangeContainer, @@ -251,13 +250,12 @@ impl GRanges, T> { self.ranges.values_mut().for_each(|ranges| ranges.sort()); self } - + pub fn shink(&mut self) { todo!() } } - impl GRanges, T> { /// Adjust all the ranges in this [`GRanges`] object in place. pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { @@ -267,10 +265,9 @@ impl GRanges, T> { self } } - impl GRangesEmpty> { - /// Create a new [`GRangesEmpty`] object, with vector storage for ranges and no + /// Create a new [`GRangesEmpty`] object, with vector storage for ranges and no /// data container. pub fn new_vec(seqlens: &IndexMap) -> Self { GRangesEmpty(GRanges::new_vec(seqlens)) @@ -283,7 +280,7 @@ impl GRangesEmpty> { pub fn shink(&mut self) { todo!() } -} +} impl GRangesEmpty> { pub fn adjust_ranges(self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self { @@ -368,7 +365,8 @@ impl GRangesEmpty> { ) -> Result<(), GRangesError> { // push an unindexed (empty) range let range = RangeEmpty::new(start, end); - let range_container = self.0 + let range_container = self + .0 .ranges .get_mut(seqname) .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; @@ -435,13 +433,14 @@ impl GRangesEmpty { } } - -impl GRangesEmpty -where COITrees<()>: From { +impl GRangesEmpty +where + COITrees<()>: From, +{ /// Convert the [`VecRangesEmpty`] range containers in this [`GRangesEmpty`] to a - /// cache-oblivious interval tree range container, [`COITreesEmpty`]. This is + /// cache-oblivious interval tree range container, [`COITreesEmpty`]. This is /// done using the [`coitrees`] library by Daniel C. Jones. - pub fn to_coitrees(self) -> Result, GRangesError> { + pub fn into_coitrees(self) -> Result, GRangesError> { let old_ranges = self.0.ranges; let mut new_ranges = GenomeMap::new(); for (seqname, vec_ranges) in old_ranges.into_iter() { @@ -456,10 +455,10 @@ where COITrees<()>: From { } impl GRanges, T> { - /// Convert the [`VecRangesIndexed`] range containers in this [`GRanges`] to a - /// cache-oblivious interval tree range container, [`COITreesIndexed`]. This is + /// Convert the [`VecRangesIndexed`] range containers in this [`GRanges`] to a + /// cache-oblivious interval tree range container, [`COITreesIndexed`]. This is /// done using the [`coitrees`] library by Daniel C. Jones. - pub fn to_coitrees(self) -> Result, GRangesError> { + pub fn into_coitrees(self) -> Result, GRangesError> { let old_ranges = self.ranges; let mut new_ranges = GenomeMap::new(); for (seqname, vec_ranges) in old_ranges.into_iter() { @@ -489,7 +488,7 @@ where pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( self, // right: &GRanges, DR>, - right: &'a impl AsGRangesRef<'a, COITrees, DR> + right: &'a impl AsGRangesRef<'a, COITrees, DR>, ) -> Result, GRangesError> { let mut gr = GRangesEmpty::new_vec(&self.seqlens()); @@ -537,15 +536,16 @@ where /// [`GRanges`]. The data container is rebuilt from indices /// into a new [`Vec`] where `U` is the associated type [`IndexedDataContainer::Item`], /// which represents the individual data element in the data container. - pub fn filter_overlaps( + pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( self, - right: &GRanges, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, ) -> Result>, GRangesError> { let mut gr: GRanges> = GRanges::new_vec(&self.seqlens()); + let right_ref = right.as_granges_ref(); for (seqname, left_ranges) in self.ranges.iter() { for left_range in left_ranges.iter_ranges() { - if let Some(right_ranges) = right.ranges.get(seqname) { + if let Some(right_ranges) = right_ref.ranges.get(seqname) { let num_overlaps = right_ranges.count_overlaps(left_range.start(), left_range.end()); if num_overlaps == 0 { @@ -565,11 +565,9 @@ where } } - - impl GRanges where -R: IterableRangeContainer, + R: IterableRangeContainer, { /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object. pub fn iter_ranges(&self) -> GRangesIterator<'_, R> { @@ -577,10 +575,9 @@ R: IterableRangeContainer, } } - impl GRangesEmpty where -R: IterableRangeContainer, + R: IterableRangeContainer, { /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRangesEmpty`] object. pub fn iter_ranges(&self) -> GRangesIterator<'_, R> { @@ -588,7 +585,6 @@ R: IterableRangeContainer, } } - #[cfg(test)] mod tests { use indexmap::indexmap; @@ -600,7 +596,7 @@ mod tests { #[test] fn test_new_vec() { - let seqlens = indexmap! { "chr1".to_string() => 10}; + let seqlens = seqlens! { "chr1" => 10}; let mut gr = GRanges::new_vec(&seqlens); gr.push_range("chr1", 0, 10, 1.1).unwrap(); assert_eq!(gr.len(), 1); @@ -615,7 +611,51 @@ mod tests { #[test] fn test_to_coitrees() { let gr_vec = granges_test_case_01(); - let gr = gr_vec.clone().to_coitrees().unwrap(); + let gr = gr_vec.clone().into_coitrees().unwrap(); assert_eq!(gr.len(), 5); } + + #[test] + fn granges_filter_by_overlaps() { + let seqlens = seqlens! { "chr1" => 10}; + + // test 1: one range that overlaps only the first range + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 0, 1).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.filter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 1); + + // test 2: one range that overlaps the first two ranges + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 0, 5).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.filter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 2); + + // test 3: one range that overlaps no ranges + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 8, 9).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.filter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 0); + + // test 4: ranges on two chromosomes: first overlaps two ranges, second + // overlaps one + let gr = granges_test_case_01(); + let seqlens = seqlens! { "chr1" => 10, "chr2" => 10 }; + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 4, 7).unwrap(); + gr_keep.push_range("chr2", 10, 12).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.filter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 3); + } } diff --git a/src/io/mod.rs b/src/io/mod.rs index 99946e3..2b377c4 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,4 +6,6 @@ pub mod noodles; pub mod parsers; pub use file::{InputFile, OutputFile}; -pub use parsers::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator}; +pub use parsers::{ + Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator, +}; diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 0f71e21..29a38b4 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -210,27 +210,27 @@ impl GenomicRangesFile { Ok(file_type) } - /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error + /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error /// if the filetype is not supported. - /// - /// This returns a [`GenomicRangesParsers`] enum, since the parsing iterator filetype + /// + /// This returns a [`GenomicRangesParsers`] enum, since the parsing iterator filetype /// cannot be known at compile time. - pub fn parsing_iterator(filepath: impl Clone + Into) -> Result { + pub fn parsing_iterator( + filepath: impl Clone + Into, + ) -> Result { let path = filepath.into(); - dbg!(&path); match Self::detect(path)? { - GenomicRangesFile::Bed3(path) => - Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)), - GenomicRangesFile::Bedlike(path) => - Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)), - GenomicRangesFile::Unsupported => { - Err(GRangesError::UnsupportedGenomicRangesFileFormat) + GenomicRangesFile::Bed3(path) => { + Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)) + } + GenomicRangesFile::Bedlike(path) => { + Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)) } + GenomicRangesFile::Unsupported => Err(GRangesError::UnsupportedGenomicRangesFileFormat), } } } - /// An extensible TSV parser, which uses a supplied parser function to /// convert a line into a [`RangeRecord`], a range with generic associated /// data. @@ -249,7 +249,7 @@ impl std::fmt::Debug for TsvRecordIterator { impl TsvRecordIterator where -F: Fn(&str) -> Result, + F: Fn(&str) -> Result, { /// Create a new [`TsvRecordIterator`], which parses lines from the supplied /// file path into [`RangeRecord`] using the specified parsing function. @@ -270,7 +270,7 @@ F: Fn(&str) -> Result, impl Iterator for TsvRecordIterator where -F: Fn(&str) -> Result, + F: Fn(&str) -> Result, { type Item = Result; @@ -303,7 +303,7 @@ pub struct Bed3Iterator { iter: TsvRecordIterator< fn(&str) -> Result, GenomicRangeEmptyRecord, - >, + >, } impl Bed3Iterator { @@ -338,7 +338,7 @@ pub struct BedlikeIterator { iter: TsvRecordIterator< fn(&str) -> Result>, GRangesError>, GenomicRangeRecord>, - >, + >, } impl BedlikeIterator { @@ -373,7 +373,7 @@ impl BedlikeIterator { /// 2. During iteration, a `None` data element is encountered. pub fn try_unwrap_data( self, - ) -> Result, GRangesError>>, GRangesError> + ) -> Result, GRangesError>>, GRangesError> { if self.number_columns() < 4 { return Err(GRangesError::TooFewColumns)?; @@ -382,11 +382,11 @@ impl BedlikeIterator { result.and_then(|record| { if let Some(data) = record.data { Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )) + record.seqname, + record.start, + record.end, + data, + )) } else { Err(GRangesError::TryUnwrapDataError) } @@ -401,13 +401,13 @@ impl BedlikeIterator { result .map(|record| { Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - (), - )) + record.seqname, + record.start, + record.end, + (), + )) }) - .unwrap_or_else(Err) // pass through parsing errors + .unwrap_or_else(Err) // pass through parsing errors }) } @@ -425,8 +425,8 @@ impl BedlikeIterator { }) }); Ok(GenomicRangesIteratorVariant::Empty(Box::new( - without_data_iterator, - ))) + without_data_iterator, + ))) } else { let with_data_iterator = self.try_unwrap_data()?.map(|result| { result.map(|record| GenomicRangeRecord { @@ -437,8 +437,8 @@ impl BedlikeIterator { }) }); Ok(GenomicRangesIteratorVariant::WithData(Box::new( - with_data_iterator, - ))) + with_data_iterator, + ))) } } } @@ -481,7 +481,7 @@ impl Iterator for BedlikeIterator { /// ``` pub struct FilteredRanges where -I: Iterator>, + I: Iterator>, { inner: I, retain_seqnames: Option>, @@ -490,13 +490,13 @@ I: Iterator>, impl FilteredRanges where -I: Iterator>, + I: Iterator>, { pub fn new( inner: I, retain_seqnames: Option>, exclude_seqnames: Option>, - ) -> Self { + ) -> Self { let retain_seqnames = retain_seqnames.map(HashSet::from_iter); let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); Self { @@ -510,7 +510,7 @@ I: Iterator>, /// Range-filtering iterator implementation for [`GenomicRangeRecord`]. impl Iterator for FilteredRanges> where -I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { type Item = Result, GRangesError>; @@ -521,18 +521,18 @@ I: Iterator, GRangesError>>, Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -544,7 +544,7 @@ I: Iterator, GRangesError>>, /// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. impl Iterator for FilteredRanges where -I: Iterator>, + I: Iterator>, { type Item = Result; @@ -555,18 +555,18 @@ I: Iterator>, Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -579,13 +579,13 @@ impl GeneralRangeRecordIterator for Bed3Iterator { fn retain_seqnames( self, seqnames: Vec, - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, Some(seqnames), None) } fn exclude_seqnames( self, seqnames: Vec, - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, None, Some(seqnames)) } } @@ -609,7 +609,7 @@ impl GeneralRangeRecordIterator for Bed3Iterator { /// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. pub fn parse_column(column: &str, line: &str) -> Result where -::Err: std::fmt::Debug, + ::Err: std::fmt::Debug, { // NOTE: this is used a lot, and should be benchmarked. column @@ -725,15 +725,15 @@ mod tests { fn test_rangefiletype_detect() { let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bed3(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bed3(_) + )); let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bedlike(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bedlike(_) + )); } #[test] @@ -741,11 +741,11 @@ mod tests { assert_eq!( valid_bedlike(&mut InputFile::new("tests_data/example.bed")).unwrap(), true - ); + ); assert_eq!( valid_bedlike(&mut InputFile::new("tests_data/invalid_format.bed")).unwrap(), false - ); + ); } //#[test] diff --git a/src/iterators.rs b/src/iterators.rs index 0200d72..cb43a48 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -2,7 +2,7 @@ use genomap::GenomeMap; use crate::{ ranges::GenomicRangeIndexedRecord, - traits::{GenericRange, RangeContainer, IterableRangeContainer}, + traits::{GenericRange, IterableRangeContainer, RangeContainer}, }; /// An iterator yielding [`RangeIndexedRecord`], which store diff --git a/src/lib.rs b/src/lib.rs index ff1b3e3..d290463 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,12 +26,14 @@ pub mod prelude { pub use crate::error::GRangesError; pub use crate::granges::{GRanges, GRangesEmpty}; pub use crate::io::file::read_seqlens; - pub use crate::io::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, TsvRecordIterator, GenomicRangesParser}; + pub use crate::io::{ + Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator, + }; pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ - GeneralRangeRecordIterator, GenericRange, GenomicRangesTsvSerialize, IndexedDataContainer, - IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize, + AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangesTsvSerialize, + IndexedDataContainer, IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize, }; pub use crate::seqlens; diff --git a/src/main/mod.rs b/src/main/mod.rs index 368b956..151f45e 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -1,12 +1,10 @@ use std::path::PathBuf; -use clap::{Args, Parser, Subcommand}; +use clap::{Parser, Subcommand}; use granges::{ commands::{granges_adjust, granges_filter}, - io::{parsers::Bed3Iterator, OutputFile}, - prelude::{read_seqlens, GRanges, GRangesError, GenomicRangesFile}, - reporting::{Report, CommandOutput}, - PositionOffset, granges::GRangesEmpty, + prelude::GRangesError, + PositionOffset, }; #[cfg(feature = "dev-commands")] @@ -86,7 +84,7 @@ enum Commands { /// number of random ranges to generate #[arg(long, required = true)] - num: u32, + num: usize, /// an optional output file (standard output will be used if not specified) #[arg(long)] @@ -114,9 +112,7 @@ fn run() -> Result<(), GRangesError> { right, output, sort, - }) => { - granges_filter(seqlens, left, right, output.as_ref(), *sort) - } + }) => granges_filter(seqlens, left, right, output.as_ref(), *sort), #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { seqlens, diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 0125719..cd4c78b 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -104,9 +104,11 @@ impl RangeContainer for COITrees { /// Convert between [`coitrees::Interval`] with index metadata to a [`RangeEmpty`]. impl From> for RangeEmpty { fn from(value: Interval<&()>) -> Self { + let first: Position = value.first.try_into().unwrap(); + let last: Position = value.last.try_into().unwrap(); RangeEmpty { - start: value.first.try_into().unwrap(), - end: value.last.try_into().unwrap(), + start: first, + end: last + 1, } } } @@ -114,9 +116,11 @@ impl From> for RangeEmpty { /// Convert between [`coitrees::Interval`] with index metadata to a [`RangeIndexed`]. impl From> for RangeIndexed { fn from(value: Interval<&usize>) -> Self { + let first: Position = value.first.try_into().unwrap(); + let last: Position = value.last.try_into().unwrap(); RangeIndexed { - start: value.first.try_into().unwrap(), - end: value.last.try_into().unwrap(), + start: first, + end: last + 1, index: *value.metadata(), } } @@ -151,7 +155,9 @@ impl GenericInterval<()> for RangeEmpty { self.start.try_into().unwrap() } fn last(&self) -> i32 { - self.end.try_into().unwrap() + let end: i32 = self.end.try_into().unwrap(); + // convert right-exclusive to inclusive + end - 1 } fn metadata(&self) -> &() { &() @@ -163,13 +169,17 @@ impl GenericInterval for RangeIndexed { self.start.try_into().unwrap() } fn last(&self) -> i32 { - self.end.try_into().unwrap() + let end: i32 = self.end.try_into().unwrap(); + // convert right-exclusive to inclusive + end - 1 } fn metadata(&self) -> &usize { &self.index } } +// Note: this is predominantly used in the join logic, where overlapping +// ranges as [`coitrees::IntervalNode`] are processed. impl GenericRange for IntervalNode<(), usize> { fn start(&self) -> Position { self.first().try_into().unwrap() @@ -182,6 +192,8 @@ impl GenericRange for IntervalNode<(), usize> { } } +// Note: this is predominantly used in the join logic, where overlapping +// ranges as [`coitrees::IntervalNode`] are processed. impl GenericRange for IntervalNode { fn start(&self) -> Position { self.first().try_into().unwrap() @@ -196,17 +208,53 @@ impl GenericRange for IntervalNode { #[cfg(test)] mod tests { + use coitrees::{GenericInterval, Interval}; + use crate::prelude::*; - use crate::ranges::RangeIndexed; + use crate::ranges::{RangeEmpty, RangeIndexed}; use crate::test_utilities::granges_test_case_01; #[test] fn test_ranges_iterable_coitrees() { - let gr = granges_test_case_01().to_coitrees().unwrap(); + let gr = granges_test_case_01().into_coitrees().unwrap(); let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges(); assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0)); assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(4, 7, 1)); assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(10, 17, 2)); assert!(chr1_iter.next().is_none()); } + + #[test] + fn test_from_interval_to_range_empty() { + let interval: Interval<&()> = Interval::new(0, 10, &()); + let range_empty: RangeEmpty = interval.into(); + assert_eq!(range_empty.start, 0); + assert_eq!(range_empty.end, 11); + assert_eq!(range_empty.start(), 0); + assert_eq!(range_empty.end(), 11); + } + + #[test] + fn test_from_interval_to_range_indexed() { + let interval: Interval<&usize> = Interval::new(0, 10, &0); + let range_empty: RangeIndexed = interval.into(); + assert_eq!(range_empty.start, 0); + assert_eq!(range_empty.end, 11); + assert_eq!(range_empty.start(), 0); + assert_eq!(range_empty.end(), 11); + } + + #[test] + fn test_generic_interval_for_range_empty() { + let range = RangeEmpty::new(0, 10); + assert_eq!(range.first(), 0); + assert_eq!(range.last(), 9); + } + + #[test] + fn test_generic_interval_for_range_indexed() { + let range = RangeIndexed::new(0, 10, 1); + assert_eq!(range.first(), 0); + assert_eq!(range.last(), 9); + } } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 281de53..e384454 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -4,7 +4,7 @@ use crate::{ error::GRangesError, - traits::{GenericRange, IndexedDataContainer, TsvSerialize, AdjustableGenericRange}, + traits::{AdjustableGenericRange, GenericRange, IndexedDataContainer, TsvSerialize}, Position, }; @@ -136,8 +136,6 @@ impl AdjustableGenericRange for GenomicRangeRecord { } } - - impl TsvSerialize for GenomicRangeRecord<()> { fn to_tsv(&self) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) @@ -157,7 +155,7 @@ impl TsvSerialize for GenomicRangeRecord> { self.start, self.end, data.to_tsv() - ) + ) } } } @@ -206,7 +204,6 @@ impl GenericRange for GenomicRangeEmptyRecord { } } - impl AdjustableGenericRange for GenomicRangeEmptyRecord { fn set_start(&mut self, start: Position) { self.start = start @@ -239,19 +236,19 @@ impl GenomicRangeIndexedRecord { self, seqnames: &[String], data: Option<&'a T>, - ) -> GenomicRangeRecord>::Item>> - where + ) -> GenomicRangeRecord>::Item>> + where T: IndexedDataContainer<'a> + TsvSerialize, - { - let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); - - GenomicRangeRecord { - seqname: seqnames[self.seqname_index].clone(), - start: self.start, - end: self.end, - data, - } + { + let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); + + GenomicRangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data, } + } pub fn to_record_empty(self, seqnames: &[String]) -> GenomicRangeRecord<()> { GenomicRangeRecord { seqname: seqnames[self.seqname_index].clone(), @@ -297,15 +294,15 @@ pub fn validate_range( start: Position, end: Position, length: Position, - ) -> Result<(), GRangesError> { +) -> Result<(), GRangesError> { if start > end { return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { return Err(GRangesError::InvalidGenomicRangeForSequence( - start, end, length, - )); + start, end, length, + )); } Ok(()) } @@ -319,9 +316,9 @@ mod tests { fn test_invalid_range_start_end() { let result = validate_range(5, 1, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRange(5, 1)) - )); + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); } #[test] @@ -334,9 +331,9 @@ mod tests { fn test_invalid_range_length() { let result = validate_range(1, 10, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) - )); + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } #[test] diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index c6803a4..fd746ec 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,7 +2,7 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{traits::{AdjustableGenericRange}, Position, PositionOffset}; +use crate::{traits::AdjustableGenericRange, Position, PositionOffset}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 9331d01..1cbcc40 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,6 +1,8 @@ use super::operations::adjust_range; use super::{validate_range, RangeEmpty, RangeIndexed}; -use crate::traits::{GenericRange, IntoIterableRangesContainer, IterableRangeContainer, AdjustableGenericRange}; +use crate::traits::{ + AdjustableGenericRange, GenericRange, IntoIterableRangesContainer, IterableRangeContainer, +}; use crate::PositionOffset; use crate::{error::GRangesError, traits::RangeContainer, Position}; diff --git a/src/test_utilities.rs b/src/test_utilities.rs index cd55845..1230549 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -4,18 +4,21 @@ use std::path::PathBuf; use crate::{ + commands::granges_random_bed, create_granges_with_seqlens, error::GRangesError, + granges::GRangesEmpty, prelude::{GRanges, VecRangesIndexed}, ranges::{ coitrees::COITrees, vec::{VecRanges, VecRangesEmpty}, RangeEmpty, }, - Position, granges::GRangesEmpty, + Position, }; use indexmap::IndexMap; use rand::{seq::SliceRandom, thread_rng, Rng}; +use tempfile::{Builder, NamedTempFile}; /// Get the path to the `grange` command line tool after a build. /// This is used for integration tests and benchmarks. @@ -110,6 +113,27 @@ pub fn random_coitrees() -> COITrees<()> { cr } +/// Get a temporary file with a .bed suffix. +pub fn temp_bedfile() -> NamedTempFile { + Builder::new() + .suffix(".bed") + .tempfile() + .expect("Failed to create temp file") +} + +/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. +pub fn random_bedfile(length: usize) -> NamedTempFile { + let temp_bedfile = temp_bedfile(); + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + length, + Some(temp_bedfile.path()), + true, + ) + .expect("could not generate random BED file"); + temp_bedfile +} + /// Range test case #1 /// /// Ranges: diff --git a/src/traits.rs b/src/traits.rs index 7f3ef7d..e46dafc 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -3,18 +3,15 @@ use std::path::PathBuf; -use crate::{ - error::GRangesError, granges::GRanges, io::parsers::FilteredRanges, Position, -}; +use crate::{error::GRangesError, granges::GRanges, io::parsers::FilteredRanges, Position}; /// Traits for [`GRanges`] types that can be modified. //pub trait GenomicRangesModifiableRanges { // fn adjust_ranges(self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self; //} - /// The [`AsGRangesRef`] trait improves the ergonomics of working -/// with both [`GRanges`] and [`GRangesEmpty`] function arguments. +/// with both [`GRanges`] and [`GRangesEmpty`] function arguments. pub trait AsGRangesRef<'a, C, T> { fn as_granges_ref(&'a self) -> &'a GRanges; } diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index f93778a..bc28518 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -1,16 +1,12 @@ //! Validation against bedtools -use granges::{commands::granges_random_bed, test_utilities::granges_binary_path, prelude::{Bed3Iterator, GenomicRangesFile}}; -use std::process::Command; -use tempfile::{NamedTempFile, Builder}; - - -fn temp_bedfile() -> NamedTempFile { - Builder::new() - .suffix(".bed") - .tempfile() - .expect("Failed to create temp file") -} +use granges::{ + commands::granges_random_bed, + prelude::GenomicRangesFile, + test_utilities::{granges_binary_path, random_bedfile, temp_bedfile}, +}; +use std::{path::Path, process::Command}; +use tempfile::{Builder, NamedTempFile}; #[test] fn test_random_bed3file_filetype_detect() { @@ -21,8 +17,8 @@ fn test_random_bed3file_filetype_detect() { 100_000, Some(&random_bedfile_path), true, - ) - .expect("could not generate random BED file"); + ) + .expect("could not generate random BED file"); match GenomicRangesFile::detect(random_bedfile_path).unwrap() { GenomicRangesFile::Bed3(_) => (), @@ -30,16 +26,18 @@ fn test_random_bed3file_filetype_detect() { } } -#[test] fn test_against_bedtools_slop() { let random_bedfile = temp_bedfile(); let - random_bedfile_path = random_bedfile.path(); +#[test] +fn test_against_bedtools_slop() { + let random_bedfile = temp_bedfile(); + let random_bedfile_path = random_bedfile.path(); granges_random_bed( "tests_data/hg38_seqlens.tsv", 100_000, Some(&random_bedfile_path), true, - ) - .expect("could not generate random BED file"); + ) + .expect("could not generate random BED file"); let width = 10; @@ -71,5 +69,60 @@ fn test_random_bed3file_filetype_detect() { assert_eq!( String::from_utf8_lossy(&bedtools_output.stdout), String::from_utf8_lossy(&granges_output.stdout) - ); + ); +} + +/// Test bedtools intersect -a -b -wa -u +/// against +/// granges filter --seqlens --left --right +#[test] +fn test_against_bedtools_intersect_wa() { + let num_ranges = 1_000_000; + + let random_bedfile_left_tempfile = random_bedfile(num_ranges); + let random_bedfile_right_tempfile = random_bedfile(num_ranges); + let random_bedfile_left = random_bedfile_left_tempfile.path(); + let random_bedfile_right = random_bedfile_right_tempfile.path(); + + // for testing: uncomment and results are local for inspection + // let random_bedfile_left = Path::new("test_left.bed"); + // let random_bedfile_right = Path::new("test_right.bed"); + + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + num_ranges, + Some(&random_bedfile_right), + true, + ) + .expect("could not generate random BED file"); + + let bedtools_output = Command::new("bedtools") + .arg("intersect") + .arg("-a") + .arg(&random_bedfile_left) + .arg("-b") + .arg(&random_bedfile_right) + .arg("-wa") + .arg("-u") + .output() + .expect("bedtools intersect failed"); + + let granges_output = Command::new(granges_binary_path()) + .arg("filter") + .arg("--seqlens") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(&random_bedfile_left) + .arg("--right") + .arg(&random_bedfile_right) + .output() + .expect("granges adjust failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + + assert_eq!( + String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout) + ); }