From 31b84dbd74367374830a02b9e279eff16cb21428 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Fri, 16 Feb 2024 13:46:46 -0800 Subject: [PATCH] New cleaner handling of BED3 vs BED+ variants, new GenericRange trait, new integration tests against bedtools - new integration testing against bedtools - InputFile can detect column number - GenericRange trait, and simplified adjust_range() function - new iterator methods that more seamlessly handle BED3 BED+ - granges adjust command line function uses enum variant to handle BED3 vs BED+ - new range sorting methods - additional to_tsv methods - added bedtools install to CI --- .github/workflows/rust.yml | 4 + Cargo.toml | 10 ++- src/commands.rs | 107 ++++++++++++++++++++++++ src/data/vec.rs | 2 + src/granges.rs | 37 ++++++++- src/io/file.rs | 19 ++++- src/io/parsers.rs | 15 +++- src/lib.rs | 9 ++- src/main/commands.rs | 73 ----------------- src/main/mod.rs | 22 +++-- src/ranges/mod.rs | 152 ++++++++++++++++++++++++++++++++--- src/ranges/operations.rs | 89 +++----------------- src/ranges/vec.rs | 14 +++- src/{main => }/reporting.rs | 0 src/traits.rs | 17 +++- tests/bedtools_validation.rs | 55 +++++++++++++ 16 files changed, 448 insertions(+), 177 deletions(-) create mode 100644 src/commands.rs delete mode 100644 src/main/commands.rs rename src/{main => }/reporting.rs (100%) create mode 100644 tests/bedtools_validation.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 31000a2..8c2ae72 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -16,6 +16,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y bedtools - name: Build run: cargo build --verbose - name: Run tests diff --git a/Cargo.toml b/Cargo.toml index f5c2ce5..5024788 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,10 +26,18 @@ thiserror = "1.0.57" # cli = [ "clap" ] // TODO make feature dev-commands = [ ] +[profile.release] +opt-level = 3 + +[profile.dev] +opt-level = 1 [[bin]] name = "granges" path = "src/main/mod.rs" -# required-features = ["cli"] + +[dev-dependencies] +tempfile = "3.10.0" + diff --git a/src/commands.rs b/src/commands.rs new file mode 100644 index 0000000..52555b5 --- /dev/null +++ b/src/commands.rs @@ -0,0 +1,107 @@ +use std::path::PathBuf; + +use crate::{prelude::*, PositionOffset, reporting::{CommandOutput, Report}, io::OutputFile, ranges::operations::adjust_range, test_utilities::random_granges}; + + +/// Adjust the genomic ranges in a bedfile by some specified amount. +// NOTE: we don't do build the full GRanges objects here, for efficiency. +// But it would be a good benchmark to see how much slower that is. +pub fn granges_adjust( + bedfile: &PathBuf, + seqlens: &PathBuf, + both: PositionOffset, + output: Option<&PathBuf>, + sort: bool, +) -> Result, GRangesError> { + let genome = read_seqlens(seqlens)?; + + // input iterator + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + // output stream -- header is None for now (TODO) + let output_stream = output.map_or(OutputFile::new_stdout(None), |file| { + OutputFile::new(file, None) + }); + let mut writer = output_stream.writer()?; + + // for reporting stuff to the user + let mut report = Report::new(); + + // if we need to sort, we need to accumulate ranges in memory + enum GRangesVariant { + Indexed(GRanges>), + Empty(GRanges), + } + + let number_columns = bedlike_iterator.number_columns(); + let mut gr = match number_columns { + 3 => GRangesVariant::Empty(GRanges::new_vec(&genome)), + n if n > 3 => GRangesVariant::Indexed(GRanges::new_vec(&genome)), + _ => panic!("Unexpected number of columns"), + }; + + let mut skipped_ranges = 0; + for record in bedlike_iterator { + let range = record?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let possibly_adjusted_range = adjust_range(range, -both, both, length); + + if let Some(range_adjusted) = possibly_adjusted_range { + if !sort { + writer.write_all(&range_adjusted.to_tsv().into_bytes())?; + } else { + // we need to sort, so we build up the appropriate type of GRanges + // object, depending on if we need to hold data or not. + match gr { + GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end)?, + GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end, range_adjusted.data)?, + } + } + } else { + skipped_ranges += 1; + } + + if skipped_ranges > 0 { + report.add_issue(format!( + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) + } + } + + // if we need to sort the ranges, do that and then output them + if sort { + match gr { + GRangesVariant::Empty(obj) => obj.sort().to_bed3(output)?, + GRangesVariant::Indexed(obj) => obj.sort().to_tsv(output)?, + } + + } + Ok(CommandOutput::new((), report)) +} + +/// Generate a random BED-like file with genomic ranges. +pub fn granges_random_bed( + seqlens: impl Into, + num: u32, + output: Option>, + sort: bool, + ) -> Result, GRangesError> { + // get the genome info + let genome = read_seqlens(seqlens)?; + + let mut gr = random_granges(&genome, num.try_into().unwrap())?; + + if sort { + gr = gr.sort(); + } + + gr.to_bed3(output)?; + + let report = Report::new(); + Ok(CommandOutput::new((), report)) +} diff --git a/src/data/vec.rs b/src/data/vec.rs index a794fe4..fd5185d 100644 --- a/src/data/vec.rs +++ b/src/data/vec.rs @@ -1,5 +1,7 @@ //! Data container implementations for [`Vec`]. +use crate::traits::IndexedDataContainer; + /// Trait methods for the commonly-used `Vec` data container. /// diff --git a/src/granges.rs b/src/granges.rs index 7088610..5f1e6af 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -12,7 +12,7 @@ use crate::{ vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, RangeEmpty, RangeIndexed, RangeRecord, }, - traits::{RangeContainer, RangesIterable, TsvSerialize}, + traits::{RangeContainer, RangesIterable, TsvSerialize, GenericRange, IndexedDataContainer}, Position, }; @@ -47,7 +47,7 @@ where } } -impl GRanges, T> { +impl GRanges, T> { /// Create a new [`GRanges`] object, with vector storage for ranges and data. /// /// This combination of range and data containers is used when loading data into @@ -68,6 +68,16 @@ impl GRanges, T> { } Self { ranges, data: None } } + + /// Consume this [`GRanges`] object and sort the ranges. + pub fn sort(mut self) -> Self { + self.ranges + .values_mut() + .for_each(|ranges| { + ranges.sort() + }); + self + } } impl GRanges> { @@ -96,6 +106,29 @@ impl GRanges> { } } +impl<'a, T> GRanges, T> +where T: IndexedDataContainer<'a>, + T: TsvSerialize, + >::Item: TsvSerialize +{ + /// + pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { + // output stream -- header is None for now (TODO) + let output = output.map_or(OutputFile::new_stdout(None), |file| { + OutputFile::new(file, None) + }); + let mut writer = output.writer()?; + + let seqnames = self.seqnames(); + for range in self.iter_ranges() { + let record = range.to_record(&seqnames, self.data.as_ref().unwrap()); + writeln!(writer, "{}", record.to_tsv())?; + } + Ok(()) + } + +} + impl GRanges { /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. pub fn push_range_empty( diff --git a/src/io/file.rs b/src/io/file.rs index 8fc69b7..ab8665c 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -93,7 +93,8 @@ impl InputFile { } /// Collects comment lines and/or a line at the start of the file. - pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result { + pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) + -> io::Result { let mut buf_reader = self.reader()?; let mut comments = Vec::new(); let mut line = String::new(); @@ -122,6 +123,20 @@ impl InputFile { Ok(self.skip_lines > 0) } + + /// Detect the number of columns *from the first line*, according to some delimiter. + /// This is not robust against ragged delimited data formats. + pub fn detect_columns(&mut self, delim: &str) -> Result { + let mut skipped_lines = 0; + let mut buf_reader = self.reader()?; + while skipped_lines < self.skip_lines { + skipped_lines += 1; + } + let mut line = String::new(); + buf_reader.read_line(&mut line)?; + Ok(line.split(delim).count()) + } + /// Method to continue reading after skipping the comment and header lines. pub fn continue_reading(&self) -> io::Result>> { let mut buf_reader = self.reader()?; @@ -197,7 +212,7 @@ impl OutputFile { Box::new(BufWriter::new(File::create(path)?)) } } - OutputDestination::Stdout => Box::new(io::stdout()), + OutputDestination::Stdout => Box::new(BufWriter::new(io::stdout())), }; // write header if one is set if let Some(entries) = &self.header { diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 7b3faf8..b3124da 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -58,8 +58,12 @@ impl GeneralRangeRecordIterator<()> for Bed3RecordIterator { } } +/// An extensible TSV parser, which uses a supplied parser function to +/// convert a line into a [`RangeRecord`], a range with generic associated +/// data. pub struct TsvRecordIterator { reader: BufReader>, + num_columns: usize, parser: F, phantom: std::marker::PhantomData, } @@ -68,12 +72,17 @@ impl TsvRecordIterator where F: Fn(&str) -> Result, GRangesError>, { + /// Create a new [`TsvRecordIterator`], which parses lines from the supplied + /// file path into [`RangeRecord`] using the specified parsing function. pub fn new(filepath: impl Into, parser: F) -> Result { - let input_file = InputFile::new(filepath); + let mut input_file = InputFile::new(filepath); + let _has_metadata = input_file.collect_metadata("#", None); + let num_columns = input_file.detect_columns("\t")?; let reader = input_file.continue_reading()?; Ok(Self { reader, + num_columns, parser, phantom: std::marker::PhantomData, }) @@ -108,12 +117,14 @@ pub struct BedlikeIterator { impl BedlikeIterator { pub fn new(filepath: impl Into) -> Result { - // Wrap the parse_bedlike_to_range_record function to conform with TsvRecordIterator's expectations. let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; let iter = TsvRecordIterator::new(filepath, parser)?; Ok(Self { iter }) } + pub fn number_columns(&self) -> usize { + self.iter.num_columns + } } impl Iterator for BedlikeIterator { diff --git a/src/lib.rs b/src/lib.rs index 19b1484..d39aa23 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,12 +7,18 @@ pub use indexmap; pub mod error; pub mod granges; pub mod io; +pub mod data; pub mod iterators; pub mod join; pub mod ranges; pub mod test_utilities; pub mod traits; +// bringing these CLI modules into lib.rs rather than main/ allows for +// use in integration tests and other Rust-side command line work +pub mod commands; +pub mod reporting; + pub type Position = u32; pub type PositionOffset = i32; // signed variant @@ -24,7 +30,8 @@ pub mod prelude { pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ - GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable, TsvSerialize, + IndexedDataContainer, + GeneralRangeRecordIterator, GenericRange, RangesIntoIterable, RangesIterable, TsvSerialize, }; pub use crate::seqlens; diff --git a/src/main/commands.rs b/src/main/commands.rs deleted file mode 100644 index 56dd90a..0000000 --- a/src/main/commands.rs +++ /dev/null @@ -1,73 +0,0 @@ -use std::path::PathBuf; - -use granges::io::OutputFile; -use granges::ranges::operations::adjust_range_record; -use granges::test_utilities::random_granges; -use granges::{prelude::*, PositionOffset}; - -use crate::reporting::{CommandOutput, Report}; - -/// Adjust the genomic ranges in a bedfile by some specified amount. -// NOTE: we don't do build the full GRanges objects here, for efficiency. -// But it would be a good benchmark to see how much slower that is. -pub fn granges_adjust( - bedfile: &PathBuf, - seqlens: &PathBuf, - both: PositionOffset, - output: Option<&PathBuf>, -) -> Result, GRangesError> { - let genome = read_seqlens(seqlens)?; - - // input iterator - let bedlike_iterator = BedlikeIterator::new(bedfile)?; - - // output stream -- header is None for now (TODO) - let output = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) - }); - let mut writer = output.writer()?; - - // for reporting stuff to the user - let mut report = Report::new(); - - let mut skipped_ranges = 0; - for record in bedlike_iterator { - let range = record?; - let seqname = &range.seqname; - let length = *genome - .get(seqname) - .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; - - let possibly_adjusted_range = adjust_range_record(range, -both, both, length); - if let Some(range_adjusted) = possibly_adjusted_range { - writeln!(writer, "{}", range_adjusted.to_tsv())?; - } else { - skipped_ranges += 1; - } - - if skipped_ranges > 0 { - report.add_issue(format!( - "{} ranges were removed because their widths after adjustment were ≤ 0", - skipped_ranges - )) - } - } - Ok(CommandOutput::new((), report)) -} - -/// Generate a random BED-like file with genomic ranges. -pub fn granges_random_bed( - seqlens: &PathBuf, - num: u32, - output: Option<&PathBuf>, -) -> Result, GRangesError> { - // get the genome info - let genome = read_seqlens(seqlens)?; - - let gr = random_granges(&genome, num.try_into().unwrap())?; - - gr.to_bed3(output)?; - - let report = Report::new(); - Ok(CommandOutput::new((), report)) -} diff --git a/src/main/mod.rs b/src/main/mod.rs index ec27124..8709632 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -1,12 +1,10 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; -use commands::granges_random_bed; -use granges::{prelude::GRangesError, PositionOffset}; +use granges::{prelude::GRangesError, PositionOffset, commands::{granges_adjust}}; -pub mod commands; -pub mod reporting; -use crate::commands::granges_adjust; +#[cfg(feature = "dev-commands")] +use granges::commands::granges_random_bed; const INFO: &str = "\ granges: genomic range operations built off of the GRanges library @@ -44,6 +42,10 @@ enum Commands { /// an optional output file (standard output will be used if not specified) #[arg(long)] output: Option, + /// sort the ranges after adjusting their start and end positions + #[arg(long)] + sort: bool, + }, #[cfg(feature = "dev-commands")] @@ -57,6 +59,9 @@ enum Commands { /// an optional output file (standard output will be used if not specified) #[arg(long)] output: Option, + /// sort the ranges + #[arg(long)] + sort: bool, }, } @@ -68,14 +73,17 @@ fn run() -> Result<(), GRangesError> { seqlens, both, output, - }) => granges_adjust(bedfile, seqlens, *both, output.as_ref()), + sort, + }) => granges_adjust(bedfile, seqlens, *both, output.as_ref(), *sort), #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { seqlens, num, output, - }) => granges_random_bed(seqlens, *num, output.as_ref()), + sort, + }) => granges_random_bed(seqlens, *num, output.as_ref(), *sort), None => { + println!("{}\n", INFO); std::process::exit(1); } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 740e42c..5669bb5 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -2,7 +2,11 @@ //! //! -use crate::{error::GRangesError, traits::TsvSerialize, Position}; +use crate::{ + error::GRangesError, + traits::{GenericRange, TsvSerialize, IndexedDataContainer}, + Position, +}; pub mod coitrees; pub mod operations; @@ -24,6 +28,24 @@ impl RangeEmpty { } } +impl GenericRange for RangeEmpty { + fn start(&self) -> Position { + self.start + } + fn end(&self) -> Position { + self.end + } + fn index(&self) -> Option { + None + } + fn set_start(&mut self, start: Position) { + self.start = start + } + fn set_end(&mut self, end: Position) { + self.end = end + } +} + /// [`RangeIndexed`] is a range with a valid /// index to a data element in the data container. #[derive(Clone, Debug, Default, PartialEq)] @@ -43,6 +65,24 @@ impl RangeIndexed { } } +impl GenericRange for RangeIndexed { + fn start(&self) -> Position { + self.start + } + fn end(&self) -> Position { + self.end + } + fn index(&self) -> Option { + Some(self.index) + } + fn set_start(&mut self, start: Position) { + self.start = start + } + fn set_end(&mut self, end: Position) { + self.end = end + } +} + /// Represents a parsed range entry, possibly containing some data. #[derive(Debug, Clone, PartialEq)] pub struct RangeRecord { @@ -63,12 +103,55 @@ impl RangeRecord { } } +impl GenericRange for RangeRecord { + fn start(&self) -> Position { + self.start + } + fn end(&self) -> Position { + self.end + } + fn index(&self) -> Option { + None + } + fn set_start(&mut self, start: Position) { + self.start = start + } + fn set_end(&mut self, end: Position) { + self.end = end + } +} + impl TsvSerialize for RangeRecord<()> { fn to_tsv(&self) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) } } +impl TsvSerialize for RangeRecord> { + fn to_tsv(&self) -> String { + match &self.data { + None => { + format!( + "{}\t{}\t{}", + self.seqname, + self.start, + self.end, + ) + }, + Some(data) => { + format!( + "{}\t{}\t{}\t{}", + self.seqname, + self.start, + self.end, + data.to_tsv() + ) + + } + } + } +} + impl TsvSerialize for RangeRecord { fn to_tsv(&self) -> String { format!( @@ -77,7 +160,7 @@ impl TsvSerialize for RangeRecord { self.start, self.end, self.data.to_tsv() - ) + ) } } @@ -107,6 +190,24 @@ impl RangeEmptyRecord { } } +impl GenericRange for RangeEmptyRecord { + fn start(&self) -> Position { + self.start + } + fn end(&self) -> Position { + self.end + } + fn index(&self) -> Option { + None + } + fn set_start(&mut self, start: Position) { + self.start = start + } + fn set_end(&mut self, end: Position) { + self.end = end + } +} + /// Represents a range entry, with indices to sequence name and data. #[derive(Debug, Clone, PartialEq)] pub struct RangeIndexedRecord { @@ -125,6 +226,35 @@ impl RangeIndexedRecord { index, } } + pub fn to_record<'a, T>(self, seqnames: &Vec, data: &'a T) + -> RangeRecord<>::Item> + where T: IndexedDataContainer<'a> + TsvSerialize + { + RangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data: data.get_value(self.index), + } + } +} + +impl GenericRange for RangeIndexedRecord { + fn start(&self) -> Position { + self.start + } + fn end(&self) -> Position { + self.end + } + fn index(&self) -> Option { + Some(self.index) + } + fn set_start(&mut self, start: Position) { + self.start = start + } + fn set_end(&mut self, end: Position) { + self.end = end + } } /// Validates whether a given range is valid for accessing a sequence of a given `length`. @@ -141,15 +271,15 @@ pub fn validate_range( start: Position, end: Position, length: Position, -) -> Result<(), GRangesError> { + ) -> Result<(), GRangesError> { if start > end { return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { return Err(GRangesError::InvalidGenomicRangeForSequence( - start, end, length, - )); + start, end, length, + )); } Ok(()) } @@ -163,9 +293,9 @@ mod tests { fn test_invalid_range_start_end() { let result = validate_range(5, 1, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRange(5, 1)) - )); + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); } #[test] @@ -178,8 +308,8 @@ mod tests { fn test_invalid_range_length() { let result = validate_range(1, 10, 10); assert!(matches!( - result, - Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) - )); + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } } diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index a32a501..00dc4e9 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,80 +2,18 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{Position, PositionOffset}; - -use super::{RangeEmpty, RangeIndexed, RangeRecord}; - -/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] -/// and returning `None` if the range has zero width after adjustment. -pub fn adjust_range_record( - range: RangeRecord, - start_delta: PositionOffset, - end_delta: PositionOffset, - length: Position, -) -> Option> { - let start: PositionOffset = range.start.try_into().unwrap(); - let end: PositionOffset = range.end.try_into().unwrap(); - let length: PositionOffset = length.try_into().unwrap(); - - // ensure within [0, length] - let new_start = (start + start_delta).max(0).min(length); - // ensure new_end >= new_start and within [0, length] - let new_end = (end + end_delta).max(new_start).min(length); - - // check for zero-width range - if new_end <= new_start { - // return None if the range has zero width - None - } else { - Some(RangeRecord::new( - range.seqname, - new_start.try_into().unwrap(), - new_end.try_into().unwrap(), - range.data, - )) - } -} - -/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] -/// and returning `None` if the range has zero width after adjustment. -pub fn adjust_empty( - range: RangeEmpty, - start_delta: PositionOffset, - end_delta: PositionOffset, - length: Position, -) -> Option { - let start: PositionOffset = range.start.try_into().unwrap(); - let end: PositionOffset = range.end.try_into().unwrap(); - let length: PositionOffset = length.try_into().unwrap(); - - // ensure within [0, length] - let new_start = (start + start_delta).max(0).min(length); - // ensure new_end >= new_start and within [0, length] - let new_end = (end + end_delta).max(new_start).min(length); - - // check for zero-width range - if new_end <= new_start { - // return None if the range has zero width - None - } else { - Some(RangeEmpty::new( - new_start.try_into().unwrap(), - new_end.try_into().unwrap(), - )) - } -} +use crate::{traits::GenericRange, Position, PositionOffset}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. -pub fn adjust( - range: RangeIndexed, +pub fn adjust_range( + mut range: R, start_delta: PositionOffset, end_delta: PositionOffset, length: Position, -) -> Option { - let start: PositionOffset = range.start.try_into().unwrap(); - let end: PositionOffset = range.end.try_into().unwrap(); +) -> Option { + let start: PositionOffset = range.start().try_into().unwrap(); + let end: PositionOffset = range.end().try_into().unwrap(); let length: PositionOffset = length.try_into().unwrap(); // ensure within [0, length] @@ -88,35 +26,34 @@ pub fn adjust( // return None if the range has zero width None } else { - Some(RangeIndexed::new( - new_start.try_into().unwrap(), - new_end.try_into().unwrap(), - range.index, - )) + range.set_start(new_start.try_into().unwrap()); + range.set_end(new_end.try_into().unwrap()); + Some(range) } } #[cfg(test)] mod tests { + use crate::ranges::RangeIndexed; use super::*; #[test] fn test_normal_adjustment() { let range = RangeIndexed::new(5, 10, 1); - let adjusted = adjust(range, -2, 3, 15).unwrap(); + let adjusted = adjust_range(range, -2, 3, 15).unwrap(); assert_eq!(adjusted, RangeIndexed::new(3, 13, 1)); } #[test] fn test_out_of_bounds_adjustment() { let range = RangeIndexed::new(10, 12, 2); - let adjusted = adjust(range, -5, 20, 15).unwrap(); + let adjusted = adjust_range(range, -5, 20, 15).unwrap(); assert_eq!(adjusted, RangeIndexed::new(5, 15, 2)); } #[test] fn test_zero_width_result() { let range = RangeIndexed::new(5, 10, 3); - assert!(adjust(range, 5, -5, 15).is_none()); + assert!(adjust_range(range, 5, -5, 15).is_none()); } } diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 2cd5ae6..e40c698 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,5 +1,5 @@ use super::{validate_range, RangeEmpty, RangeIndexed}; -use crate::traits::{RangesIntoIterable, RangesIterable}; +use crate::traits::{GenericRange, RangesIntoIterable, RangesIterable}; use crate::{error::GRangesError, traits::RangeContainer, Position}; pub type VecRangesIndexed = VecRanges; @@ -41,6 +41,18 @@ impl VecRanges { } } +impl VecRanges { + /// Sort all the ranges. + pub fn sort(&mut self) { + self.ranges.sort_by(|a, b| { + a.start() + .cmp(&b.start()) + .then_with(|| a.end().cmp(&b.end())) + .then_with(|| a.index().cmp(&b.index())) + }); + } +} + impl RangeContainer for VecRanges { fn len(&self) -> usize { self.ranges.len() diff --git a/src/main/reporting.rs b/src/reporting.rs similarity index 100% rename from src/main/reporting.rs rename to src/reporting.rs diff --git a/src/traits.rs b/src/traits.rs index 1461c6f..0b31aed 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,7 +1,16 @@ //! Traits used by the GRanges library. //! -use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord}; +use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord, Position}; + +/// +pub trait GenericRange: Clone { + fn start(&self) -> Position; + fn end(&self) -> Position; + fn index(&self) -> Option; + fn set_start(&mut self, start: Position); + fn set_end(&mut self, end: Position); +} /// Defines functionality common to all range containers, e.g. [`VecRanges`] and /// [`COITrees`]. @@ -88,6 +97,12 @@ pub trait TsvSerialize { fn to_tsv(&self) -> String; } +impl TsvSerialize for &String { + fn to_tsv(&self) -> String { + self.to_string() + } +} + impl TsvSerialize for String { fn to_tsv(&self) -> String { self.to_string() diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs new file mode 100644 index 0000000..01965a7 --- /dev/null +++ b/tests/bedtools_validation.rs @@ -0,0 +1,55 @@ +//! Validation against bedtools + +use std::{process::Command, path::PathBuf}; +use granges::commands::granges_random_bed; +use tempfile::NamedTempFile; + + +fn granges_binary_path() -> PathBuf { + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + path.push("target"); + path.push(if cfg!(debug_assertions) { "debug" } else { "release" }); + path.push("granges"); + path +} + +#[test] +fn test_against_bedtools_slop() { + + let random_bedfile = NamedTempFile::new().expect("Failed to create temp file"); + let random_bedfile_path = random_bedfile.path().to_path_buf(); + granges_random_bed("tests_data/hg38_seqlens.tsv", 100_000, + Some(&random_bedfile_path), true) + .expect("could not generate random BED file"); + + let width = 10; + + let bedtools_output = Command::new("bedtools") + .arg("slop") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-b") + .arg(width.to_string()) + .arg("-i") + .arg(random_bedfile.path()) + .output() + .expect("bedtools slop failed"); + + let granges_output = Command::new(granges_binary_path()) + .arg("adjust") + .arg("--seqlens") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--both") + .arg(width.to_string()) + .arg("--sort") + .arg(random_bedfile.path()) + .output() + .expect("granges adjust failed"); + + assert!(bedtools_output.status.success()); + assert!(granges_output.status.success()); + + assert_eq!(String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout)); +} +