From b7849fa77568adfd583d2d6f42fff2f449fc2a68 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 20 Feb 2024 18:44:50 -0800 Subject: [PATCH] Fixed bug with `push_ranges()`, added flanking regions subcommand. - Fixed bug with `push_ranges()` with data, where data was not being added to data container. - Fixed issues with TSV output of `Option`. - Added `flanking_regions()` methods to GRanges, with tests. - Added new trait GenericRangeOperations that implements building flanking regions for new range types. - `ensure_eq!` macro for user-friendly assert_eq statements --- src/commands.rs | 43 +++++++++- src/error.rs | 4 + src/granges.rs | 166 +++++++++++++++++++++++++++++++++++---- src/io/file.rs | 6 ++ src/io/parsers.rs | 9 +-- src/lib.rs | 55 ++++++++++++- src/main/mod.rs | 65 +++++++++++++-- src/ranges/mod.rs | 141 +++++++++++++++++++++++---------- src/ranges/operations.rs | 5 +- src/test_utilities.rs | 17 ++++ src/traits.rs | 34 +++++++- 11 files changed, 469 insertions(+), 76 deletions(-) diff --git a/src/commands.rs b/src/commands.rs index bc9b2f1..9f7cbb3 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -7,12 +7,10 @@ use crate::{ reporting::{CommandOutput, Report}, test_utilities::random_granges, traits::TsvSerialize, - PositionOffset, + Position, PositionOffset, }; /// Adjust the genomic ranges in a bedfile by some specified amount. -// NOTE: we don't do build the full GRanges objects here, for efficiency. -// But it would be a good benchmark to see how much slower that is. pub fn granges_adjust( bedfile: &PathBuf, seqlens: &PathBuf, @@ -95,7 +93,6 @@ pub fn granges_filter( right_path: &PathBuf, output: Option<&PathBuf>, skip_missing: bool, - sort: bool, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); @@ -195,6 +192,44 @@ pub fn granges_filter( } } +/// Adjust the genomic ranges in a bedfile by some specified amount. +pub fn granges_flank( + seqlens: &PathBuf, + bedfile: &PathBuf, + left: Option, + right: Option, + output: Option<&PathBuf>, + skip_missing: bool, +) -> Result, GRangesError> { + let genome = read_seqlens(seqlens)?; + let seqnames: Vec = genome.keys().cloned().collect(); + let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; + + let report = Report::new(); + match ranges_iter { + GenomicRangesParser::Bed3(iter) => { + let gr = if skip_missing { + GRangesEmpty::from_iter(iter.retain_seqnames(&seqnames), &genome)? + } else { + GRangesEmpty::from_iter(iter, &genome)? + }; + gr.flanking_ranges(left, right)?.to_tsv(output)? + } + GenomicRangesParser::Bedlike(iter) => { + let gr = if skip_missing { + GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)? + } else { + GRanges::from_iter(iter.try_unwrap_data(), &genome)? + }; + gr.flanking_ranges(left, right)?.to_tsv(output)? + } + GenomicRangesParser::Unsupported => { + return Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + } + Ok(CommandOutput::new((), report)) +} + /// Generate a random BED-like file with genomic ranges. pub fn granges_random_bed( seqlens: impl Into, diff --git a/src/error.rs b/src/error.rs index 09382ab..46bdd45 100644 --- a/src/error.rs +++ b/src/error.rs @@ -22,6 +22,8 @@ pub enum GRangesError { BedlikeTooFewColumns(String), #[error("File has invalid column type entry: {0}")] InvalidColumnType(String), + #[error("Genome file is invalid: {0}")] + InvalidGenomeFile(String), // BedlikeIterator errors #[error("GenomicRangeRecord encountered with None data in try_unwrap_data()")] @@ -45,4 +47,6 @@ pub enum GRangesError { // Command line tool related errors #[error("Unsupported genomic range format")] UnsupportedGenomicRangesFileFormat, + #[error("Command line argument error: {0}")] + ArgumentError(#[from] clap::error::Error), } diff --git a/src/granges.rs b/src/granges.rs index 66a017f..39846c7 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -56,12 +56,13 @@ //! Because at compile-time, the types *need* to be known, there are a few options here. //! -use std::path::PathBuf; +use std::{collections::HashSet, path::PathBuf}; use genomap::GenomeMap; use indexmap::IndexMap; use crate::{ + ensure_eq, io::OutputFile, iterators::GRangesIterator, prelude::GRangesError, @@ -71,8 +72,9 @@ use crate::{ GenomicRangeEmptyRecord, GenomicRangeRecord, RangeEmpty, RangeIndexed, }, traits::{ - AdjustableGenericRange, AsGRangesRef, GenericRange, GenomicRangesTsvSerialize, - IndexedDataContainer, IterableRangeContainer, RangeContainer, TsvSerialize, + AdjustableGenericRange, AsGRangesRef, GenericRange, GenericRangeOperations, + GenomicRangesTsvSerialize, IndexedDataContainer, IterableRangeContainer, RangeContainer, + TsvSerialize, }, Position, PositionOffset, }; @@ -198,7 +200,7 @@ where let seqnames = self.seqnames(); for range in self.iter_ranges() { - let record = range.to_record(&seqnames, self.data.as_ref()); + let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) @@ -266,6 +268,48 @@ impl GRanges, T> { } } +impl GRanges +where + VecRangesIndexed: IterableRangeContainer, + ::RangeType: GenericRangeOperations, +{ + /// Create a new [`GRanges`] object of the flanking ranges of the specified widths. + /// + /// # ⚠️ Warnings + /// This creates an index-only ranges container, meaning the data container is *not* + /// cloned into this object. This is because future version of the GRanges library + /// will have special reference counting based data sharing, to reduce memory overhead. + /// Until then, note that trying to access data elements with these indices may cause + /// panics. + /// + /// # Panics + /// Data accessing operations in the [`GRanges`] object returned by this function + /// will likely cause a panic — see note above. + pub fn flanking_ranges( + &self, + left: Option, + right: Option, + ) -> Result { + let mut gr: GRanges = GRanges::new_vec(&self.seqlens()); + let seqlens = self.seqlens(); + for (seqname, ranges) in self.ranges.iter() { + let seqlen = seqlens.get(seqname).unwrap(); + for range in ranges.iter_ranges() { + let flanking_ranges = range.flanking_ranges::(left, right, *seqlen); + for flanking_range in flanking_ranges { + gr.push_range_with_index( + seqname, + flanking_range.start, + flanking_range.end, + flanking_range.index, + )?; + } + } + } + Ok(gr) + } +} + impl GRangesEmpty> { /// Create a new [`GRangesEmpty`] object, with vector storage for ranges and no /// data container. @@ -288,6 +332,43 @@ impl GRangesEmpty> { } } +impl GRangesEmpty +where + VecRangesEmpty: IterableRangeContainer, + ::RangeType: GenericRangeOperations, +{ + /// Create a new [`GRanges`] object of the flanking ranges of the specified widths. + /// + /// # ⚠️ Warnings + /// This creates an index-only ranges container, meaning the data container is *not* + /// cloned into this object. This is because future version of the GRanges library + /// will have special reference counting based data sharing, to reduce memory overhead. + /// Until then, note that trying to access data elements with these indices may cause + /// panics. + /// + /// # Panics + /// Data accessing operations in the [`GRanges`] object returned by this function + /// will likely cause a panic — see note above. + pub fn flanking_ranges( + &self, + left: Option, + right: Option, + ) -> Result { + let mut gr: GRangesEmpty = GRangesEmpty::new_vec(&self.seqlens()); + let seqlens = self.seqlens(); + for (seqname, ranges) in self.0.ranges.iter() { + let seqlen = seqlens.get(seqname).unwrap(); + for range in ranges.iter_ranges() { + let flanking_ranges = range.flanking_ranges::(left, right, *seqlen); + for flanking_range in flanking_ranges { + gr.push_range(seqname, flanking_range.start, flanking_range.end)?; + } + } + } + Ok(gr) + } +} + impl GRanges> { /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. pub fn push_range( @@ -297,11 +378,13 @@ impl GRanges> { end: Position, data: U, ) -> Result<(), GRangesError> { + if self.data.is_none() { + self.data = Some(Vec::new()); + } // push data to the vec data container, getting the index let index: usize = { - let data_container = self.data.get_or_insert_with(Vec::new); - data_container.push(data); - data_container.len() - 1 // new data index + self.data.as_mut().unwrap().push(data); + self.data.as_mut().unwrap().len() - 1 // new data index }; // push an indexed range let range = RangeIndexed::new(start, end, index); @@ -330,7 +413,7 @@ where let seqnames = self.seqnames(); for range in self.iter_ranges() { - let record = range.to_record(&seqnames, self.data.as_ref()); + let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); writeln!(writer, "{}", record.to_tsv())?; } Ok(()) @@ -537,11 +620,18 @@ where /// into a new [`Vec`] where `U` is the associated type [`IndexedDataContainer::Item`], /// which represents the individual data element in the data container. pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( - self, + mut self, right: &'a impl AsGRangesRef<'a, COITrees, DR>, ) -> Result>, GRangesError> { let mut gr: GRanges> = GRanges::new_vec(&self.seqlens()); + let data = std::mem::take(&mut self.data).unwrap(); + + let mut old_indices = HashSet::new(); // the old indices to *keep* + let mut new_indices = Vec::new(); + + let mut current_index = 0; + let right_ref = right.as_granges_ref(); for (seqname, left_ranges) in self.ranges.iter() { for left_range in left_ranges.iter_ranges() { @@ -555,12 +645,27 @@ where seqname, left_range.start(), left_range.end(), - left_range.index().unwrap(), + current_index, )?; + old_indices.insert(left_range.index().unwrap()); + new_indices.push(current_index); + current_index += 1; } } } } + + // Now, we reconstruct the right data. Note that we do not use the + // standard push_range() method here, which would double the memory + // usage essentially. + let mut new_data = Vec::new(); + for (old_index, data_value) in data.into_iter().enumerate() { + if old_indices.contains(&old_index) { + new_data.push(data_value) + } + } + ensure_eq!(new_data.len(), new_indices.len()); + gr.data = Some(new_data); Ok(gr) } } @@ -587,11 +692,9 @@ where #[cfg(test)] mod tests { - use indexmap::indexmap; - use crate::{ prelude::*, - test_utilities::{granges_test_case_01, random_vecranges}, + test_utilities::{granges_test_case_01, granges_test_case_02, random_vecranges}, }; #[test] @@ -616,7 +719,7 @@ mod tests { } #[test] - fn granges_filter_by_overlaps() { + fn granges_filter_overlaps() { let seqlens = seqlens! { "chr1" => 10}; // test 1: one range that overlaps only the first range @@ -658,4 +761,39 @@ mod tests { let gr_filtered = gr.filter_overlaps(&gr_keep).unwrap(); assert_eq!(gr_filtered.len(), 3); } + + #[test] + fn test_flanking_left() { + let gr = granges_test_case_02(); + let gr_left = gr.flanking_ranges(Some(10), None).unwrap(); + + let mut gr_left_iter = gr_left.iter_ranges(); + let first_range = gr_left_iter.next().unwrap(); + assert_eq!(first_range.start(), 20); + assert_eq!(first_range.end(), 31); + + let second_range = gr_left_iter.next().unwrap(); + assert_eq!(second_range.start(), 90); + assert_eq!(second_range.end(), 101); + } + + #[test] + fn test_flanking_both() { + // Now with right flanks too. + let gr = granges_test_case_02(); + let gr_left = gr.flanking_ranges(Some(10), Some(10)).unwrap(); + + // First range is the new left flank. + let mut gr_left_iter = gr_left.iter_ranges(); + let first_range = gr_left_iter.next().unwrap(); + assert_eq!(first_range.start(), 20); + assert_eq!(first_range.end(), 31); + + // Now the next should be the new right flank. + let second_range = gr_left_iter.next().unwrap(); + assert_eq!(second_range.start(), 49); + + // the end point is the sequence length, since we can only go to end. + assert_eq!(second_range.end(), 50); + } } diff --git a/src/io/file.rs b/src/io/file.rs index 021e4bb..7fff865 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -29,6 +29,12 @@ pub fn read_seqlens( let mut columns = line.split('\t'); let seqname = columns.next().unwrap(); let length: Position = columns.next().unwrap().parse()?; + if seqlens.contains_key(seqname) { + return Err(GRangesError::InvalidGenomeFile(format!( + "sequence '{}' is duplicated", + seqname + ))); + } seqlens.insert(seqname.to_string(), length); } Ok(seqlens) diff --git a/src/io/parsers.rs b/src/io/parsers.rs index bbad5bb..cf20c54 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -715,12 +715,9 @@ pub fn parse_bed3(line: &str) -> Result { #[cfg(test)] mod tests { - use crate::{ - io::{ - parsers::{get_base_extension, valid_bedlike}, - InputFile, - }, - prelude::*, + use crate::io::{ + parsers::{get_base_extension, valid_bedlike}, + InputFile, }; use super::GenomicRangesFile; diff --git a/src/lib.rs b/src/lib.rs index d3d5644..49518af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,14 +32,19 @@ pub mod prelude { pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ - AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangeRecordUnwrappable, - GenomicRangesTsvSerialize, IndexedDataContainer, IntoIterableRangesContainer, - IterableRangeContainer, TsvSerialize, + AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, + GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, + IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize, }; pub use crate::seqlens; } +pub const INTERNAL_ERROR_MESSAGE: &str = r#" +An internal error has occurred. Please file a GitHub issue at +https://github.com/vsbuffalo/granges/issues +"#; + /// Create and [`indexmap::IndexMap`] of sequence names and their lengths. #[macro_export] macro_rules! seqlens { @@ -68,3 +73,47 @@ macro_rules! create_granges_with_seqlens { } }; } + +/// a version of assert_eq! that is more user-friendly. +#[macro_export] +macro_rules! ensure_eq { + ($left:expr, $right:expr $(,)?) => ({ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!("{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, stringify!($left), stringify!($right)); + + } + } + } + }); + ($left:expr, $right:expr, $($arg:tt)+) => ({ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!("{}\n{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, format_args!($($arg)+), stringify!($left), stringify!($right)); + } + } + } + }); +} + +#[cfg(test)] +mod tests { + #[test] + #[should_panic] + fn test_ensure_eq() { + ensure_eq!(1, 2); + } + + #[test] + #[should_panic] + fn test_ensure_eq_msg() { + ensure_eq!(1, 2, "Some description of the problem."); + } + + #[test] + fn test_ensure_eq_pass() { + ensure_eq!(1, 1); + } +} diff --git a/src/main/mod.rs b/src/main/mod.rs index 891faec..b97c451 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -2,9 +2,9 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ - commands::{granges_adjust, granges_filter}, + commands::{granges_adjust, granges_filter, granges_flank}, prelude::GRangesError, - PositionOffset, + Position, PositionOffset, }; #[cfg(feature = "dev-commands")] @@ -57,6 +57,7 @@ enum Commands { /// Sort the ranges after adjusting their start and end positions #[arg(short, long)] sort: bool, + // TODO add skip_missing here }, Filter { /// A TSV genome file of chromosome names and their lengths @@ -75,9 +76,35 @@ enum Commands { #[arg(short, long)] output: Option, - /// Sort the ranges after adjusting their start and end positions + /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. + /// By default, ranges with sequence names not in the genome file will raise an error. + #[arg(short = 'f', long)] + skip_missing: bool, + }, + Flank { + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, + + /// An input BED-like TSV file + #[arg(required = true)] + bedfile: PathBuf, + + /// Width (in basepairs) of flank regions to create on both sides of each range #[arg(short, long)] - sort: bool, + both: Option, + + /// Width (in basepairs) of flank regions to create on the left side of each range + #[arg(short, long)] + left: Option, + + /// Width (in basepairs) of flank regions to create on the right side of each range + #[arg(short, long)] + right: Option, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. /// By default, ranges with sequence names not in the genome file will raise an error. @@ -120,9 +147,35 @@ fn run() -> Result<(), GRangesError> { left, right, output, - sort, skip_missing, - }) => granges_filter(genome, left, right, output.as_ref(), *skip_missing, *sort), + }) => granges_filter(genome, left, right, output.as_ref(), *skip_missing), + Some(Commands::Flank { + genome, + bedfile, + both, + left, + right, + output, + skip_missing, + }) => { + if both.is_some() && (left.is_some() || right.is_some()) { + let error = clap::Error::raw( + clap::error::ErrorKind::ArgumentConflict, + "set either --both, or --left and/or --right", + ); + return Err(error.into()); + } + let left = left.or(*both); + let right = right.or(*both); + if left.is_none() && right.is_none() { + let error = clap::Error::raw( + clap::error::ErrorKind::ArgumentConflict, + "set either --both, or --left and/or --right", + ); + return Err(error.into()); + } + granges_flank(genome, bedfile, left, right, output.as_ref(), *skip_missing) + } #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { genome, diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index e384454..c51ec13 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -4,7 +4,10 @@ use crate::{ error::GRangesError, - traits::{AdjustableGenericRange, GenericRange, IndexedDataContainer, TsvSerialize}, + traits::{ + AdjustableGenericRange, GenericRange, GenericRangeOperations, IndexedDataContainer, + TsvSerialize, + }, Position, }; @@ -41,6 +44,32 @@ impl GenericRange for RangeEmpty { } } +impl GenericRangeOperations for RangeEmpty { + /// Create flanking regions for this [`RangeEmpty`] range. + fn flanking_ranges( + &self, + left_flank: Option, + right_flank: Option, + seqlen: Position, + ) -> Vec { + let mut flanking = Vec::new(); + if let Some(left) = left_flank { + let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); + let flank_end = std::cmp::min(self.start + 1, seqlen); + + let left_flank_region = RangeEmpty::new(flank_start, flank_end); + flanking.push(left_flank_region); + } + if let Some(right) = right_flank { + let flank_start = std::cmp::max(self.end.saturating_sub(1), 0); + let flank_end = std::cmp::min(self.end + right, seqlen); + let right_flank_region = RangeEmpty::new(flank_start, flank_end); + flanking.push(right_flank_region); + } + flanking + } +} + impl AdjustableGenericRange for RangeEmpty { fn set_start(&mut self, start: Position) { self.start = start @@ -82,6 +111,34 @@ impl GenericRange for RangeIndexed { } } +impl GenericRangeOperations for RangeIndexed { + /// Create flanking regions for this [`RangeIndexed`] range. Note that + /// the index **will be copied** to the new [`RangeIndexed`] flanking ranges + /// returned by this method. + fn flanking_ranges( + &self, + left_flank: Option, + right_flank: Option, + seqlen: Position, + ) -> Vec { + let mut flanking = Vec::new(); + if let Some(left) = left_flank { + let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); + let flank_end = std::cmp::min(self.start + 1, seqlen); + + let left_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); + flanking.push(left_flank_region); + } + if let Some(right) = right_flank { + let flank_start = std::cmp::max(self.end.saturating_sub(1), 0); + let flank_end = std::cmp::min(self.end + right, seqlen); + let right_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); + flanking.push(right_flank_region); + } + flanking + } +} + impl AdjustableGenericRange for RangeIndexed { fn set_start(&mut self, start: Position) { self.start = start @@ -142,37 +199,37 @@ impl TsvSerialize for GenomicRangeRecord<()> { } } -impl TsvSerialize for GenomicRangeRecord> { - fn to_tsv(&self) -> String { - match &self.data { - None => { - format!("{}\t{}\t{}", self.seqname, self.start, self.end,) - } - Some(data) => { - format!( - "{}\t{}\t{}\t{}", - self.seqname, - self.start, - self.end, - data.to_tsv() - ) - } - } - } -} -// -// impl TsvSerialize for GenomicRangeRecord { +// impl TsvSerialize for GenomicRangeRecord> { // fn to_tsv(&self) -> String { -// format!( -// "{}\t{}\t{}\t{}", -// self.seqname, -// self.start, -// self.end, -// self.data.to_tsv() -// ) +// match &self.data { +// None => { +// format!("{}\t{}\t{}", self.seqname, self.start, self.end,) +// } +// Some(data) => { +// format!( +// "{}\t{}\t{}\t{}", +// self.seqname, +// self.start, +// self.end, +// data.to_tsv() +// ) +// } +// } // } // } +impl TsvSerialize for GenomicRangeRecord { + fn to_tsv(&self) -> String { + format!( + "{}\t{}\t{}{}", + self.seqname, + self.start, + self.end, + self.data.to_tsv() + ) + } +} + /// Represents a genomic range entry without data, e.g. from a BED3 parser. #[derive(Debug, Clone, PartialEq)] pub struct GenomicRangeEmptyRecord { @@ -235,12 +292,12 @@ impl GenomicRangeIndexedRecord { pub fn to_record<'a, T>( self, seqnames: &[String], - data: Option<&'a T>, - ) -> GenomicRangeRecord>::Item>> + data: &'a T, + ) -> GenomicRangeRecord<>::Item> where T: IndexedDataContainer<'a> + TsvSerialize, { - let data = data.and_then(|data_ref| self.index.map(|idx| data_ref.get_value(idx))); + let data = data.get_value(self.index().unwrap()); GenomicRangeRecord { seqname: seqnames[self.seqname_index].clone(), @@ -345,16 +402,20 @@ mod tests { #[test] fn test_overlap_width() { - // let range_a = RangeEmpty::new(0, 2); - // let range_b = RangeEmpty::new(4, 6); - // assert_eq!(range_a.overlap_width(&range_b), 0); - // - // let range_a = RangeEmpty::new(0, 2); - // let range_b = RangeEmpty::new(2, 6); - // assert_eq!(range_a.overlap_width(&range_b), 0); - // + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(4, 6); + assert_eq!(range_a.overlap_width(&range_b), 0); + + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(2, 6); + assert_eq!(range_a.overlap_width(&range_b), 0); + let range_a = RangeEmpty::new(1, 3); let range_b = RangeEmpty::new(2, 5); - assert_eq!(range_a.overlap_width(&range_b), 0); + assert_eq!(range_a.overlap_width(&range_b), 1); + + let range_a = RangeEmpty::new(1, 10); + let range_b = RangeEmpty::new(2, 5); + assert_eq!(range_a.overlap_width(&range_b), 3); } } diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index fd746ec..2107d4c 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,7 +2,10 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{traits::AdjustableGenericRange, Position, PositionOffset}; +use crate::{ + traits::{AdjustableGenericRange, GenericRange}, + Position, PositionOffset, +}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 1230549..232e14a 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -154,3 +154,20 @@ pub fn granges_test_case_01() -> GRanges> { "chr2" => [(10, 20, 3.7), (18, 32, 1.1)] }, seqlens: { "chr1" => 30, "chr2" => 100 }) } + +/// Range test case #2 +/// +/// Ranges: +/// - chr1: +/// (30, 50, Some(1.1)) +/// - chr2: +/// (100, 200, Some(3.7)) +/// (250, 300, Some(1.1)) +/// +/// Seqlens: { "chr1" => 50, "chr2" => 300 } +pub fn granges_test_case_02() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(30, 50, 1.1)], + "chr2" => [(100, 200, 3.7), (250, 300, 1.1)] + }, seqlens: { "chr1" => 50, "chr2" => 300 }) +} diff --git a/src/traits.rs b/src/traits.rs index 3e7325e..bdf2470 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -49,7 +49,10 @@ pub trait GenericRange: Clone { fn overlap_width(&self, other: &R) -> Position { let overlap_start = std::cmp::max(self.start(), other.start()); let overlap_end = std::cmp::min(self.end(), other.end()); - std::cmp::max(overlap_end - overlap_start - 1, 0) + if overlap_start >= overlap_end { + return 0; + } + overlap_end.saturating_sub(overlap_start) } /// Return a tuple of the range created by an overlap with another range; `None` if no overlap. @@ -65,7 +68,27 @@ pub trait GenericRange: Clone { } } -/// The [`AdjustableGenericRange`] trait extends addtion functionality to adjustable generic ranges. +/// The [`GenericRangeOperations`] trait extends additional functionality to [`GenericRange`], +/// such as the creation of flanking regions. +pub trait GenericRangeOperations: GenericRange { + /// Creates the appropriate flanking ranges. + /// + /// If the range has an index, this index will be duplicated for any flanking ranges. This + /// is because it may be the case that the user wishes to use the data for the original + /// range for these flanking ranges. + /// + /// # Stability + /// The interface of this is likely to change, in order to handle + /// flanking regions based on strand. + fn flanking_ranges( + &self, + left_flank: Option, + right_flank: Option, + seqlen: Position, + ) -> Vec; +} + +/// The [`AdjustableGenericRange`] trait extends additional functionality to adjustable generic ranges. pub trait AdjustableGenericRange: GenericRange { /// Set the start to the specified position. fn set_start(&mut self, start: Position); @@ -192,6 +215,13 @@ impl TsvSerialize for String { } } +impl TsvSerialize for Option { + fn to_tsv(&self) -> String { + self.as_ref() + .map_or("".to_string(), |x| format!("\t{}", x.to_tsv())) + } +} + impl TsvSerialize for Vec { fn to_tsv(&self) -> String { self.iter()