From b4b1a1b21a52e10358cfbaac3c1a54dbfeb8365e Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 20 Feb 2024 21:12:35 -0800 Subject: [PATCH] granges flank validation tests, and many bug fixes. - Passing granges flank validation tests against bedtools. - granges flank now has streaming and in-memory mode; --in-mem argument. - Fixed bug with flank ranges method. - Fixed bugs with output. - Fixed `TsvSerialize` issues. --- src/commands.rs | 128 +++++++++++++++++++++++++++++------ src/granges.rs | 17 +++-- src/lib.rs | 4 +- src/main/mod.rs | 22 +++++- src/ranges/mod.rs | 118 ++++++++++++++++++++++++++++---- src/ranges/operations.rs | 5 +- tests/bedtools_validation.rs | 84 ++++++++++++++++++++++- 7 files changed, 328 insertions(+), 50 deletions(-) diff --git a/src/commands.rs b/src/commands.rs index 9f7cbb3..b2601b4 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -3,13 +3,19 @@ use std::path::PathBuf; use crate::{ io::{parsers::GenomicRangesParser, OutputFile}, prelude::*, - ranges::operations::adjust_range, + ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, reporting::{CommandOutput, Report}, test_utilities::random_granges, traits::TsvSerialize, Position, PositionOffset, }; +#[derive(Clone)] +pub enum ProcessingMode { + Streaming, + InMemory, +} + /// Adjust the genomic ranges in a bedfile by some specified amount. pub fn granges_adjust( bedfile: &PathBuf, @@ -200,31 +206,113 @@ pub fn granges_flank( right: Option, output: Option<&PathBuf>, skip_missing: bool, + mode: ProcessingMode, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; let report = Report::new(); - match ranges_iter { - GenomicRangesParser::Bed3(iter) => { - let gr = if skip_missing { - GRangesEmpty::from_iter(iter.retain_seqnames(&seqnames), &genome)? - } else { - GRangesEmpty::from_iter(iter, &genome)? - }; - gr.flanking_ranges(left, right)?.to_tsv(output)? - } - GenomicRangesParser::Bedlike(iter) => { - let gr = if skip_missing { - GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)? - } else { - GRanges::from_iter(iter.try_unwrap_data(), &genome)? - }; - gr.flanking_ranges(left, right)?.to_tsv(output)? - } - GenomicRangesParser::Unsupported => { - return Err(GRangesError::UnsupportedGenomicRangesFileFormat) + + match mode { + // Note: this is kept for benchmarking, to see how costly building GRanges + // objects is versus using streaming. + ProcessingMode::InMemory => match ranges_iter { + GenomicRangesParser::Bed3(iter) => { + let gr = if skip_missing { + GRangesEmpty::from_iter(iter.retain_seqnames(&seqnames), &genome)? + } else { + GRangesEmpty::from_iter(iter, &genome)? + }; + gr.flanking_ranges(left, right)?.to_tsv(output)? + } + GenomicRangesParser::Bedlike(iter) => { + let gr = if skip_missing { + GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)? + } else { + GRanges::from_iter(iter.try_unwrap_data(), &genome)? + }; + gr.flanking_ranges(left, right)?.to_tsv(output)? + } + GenomicRangesParser::Unsupported => { + return Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + }, + ProcessingMode::Streaming => { + // Setup Output stream -- header is None for now (TODO). + let output_stream = output.map_or(OutputFile::new_stdout(None), |file| { + OutputFile::new(file, None) + }); + let mut writer = output_stream.writer()?; + + match ranges_iter { + // FIXME: code redundancy. But too early now to design traits, etc. + GenomicRangesParser::Bed3(iter) => { + if skip_missing { + for record in iter.retain_seqnames(&seqnames) { + let range = record?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let flanking_ranges = range + .flanking_ranges::>(left, right, length); + for flanking_range in flanking_ranges { + writer.write_all(&flanking_range.to_tsv().into_bytes())?; + } + } + } else { + for record in iter { + let range = record?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let flanking_ranges = range + .flanking_ranges::(left, right, length); + for flanking_range in flanking_ranges { + writer.write_all(&flanking_range.to_tsv().into_bytes())?; + } + } + } + } + GenomicRangesParser::Bedlike(iter) => { + if skip_missing { + for record in iter.retain_seqnames(&seqnames) { + let range = record?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let flanking_ranges = range + .flanking_ranges::>(left, right, length); + for flanking_range in flanking_ranges { + writer.write_all(&flanking_range.to_tsv().into_bytes())?; + } + } + } else { + for record in iter { + let range = record?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let flanking_ranges = range + .flanking_ranges::(left, right, length); + for flanking_range in flanking_ranges { + writer.write_all(&flanking_range.to_tsv().into_bytes())?; + } + } + } + } + GenomicRangesParser::Unsupported => { + return Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + } } } Ok(CommandOutput::new((), report)) diff --git a/src/granges.rs b/src/granges.rs index 39846c7..90c064f 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -770,11 +770,11 @@ mod tests { let mut gr_left_iter = gr_left.iter_ranges(); let first_range = gr_left_iter.next().unwrap(); assert_eq!(first_range.start(), 20); - assert_eq!(first_range.end(), 31); + assert_eq!(first_range.end(), 30); let second_range = gr_left_iter.next().unwrap(); assert_eq!(second_range.start(), 90); - assert_eq!(second_range.end(), 101); + assert_eq!(second_range.end(), 100); } #[test] @@ -787,13 +787,16 @@ mod tests { let mut gr_left_iter = gr_left.iter_ranges(); let first_range = gr_left_iter.next().unwrap(); assert_eq!(first_range.start(), 20); - assert_eq!(first_range.end(), 31); + assert_eq!(first_range.end(), 30); + + // NOTE: the next flank would have been a zero-width + // right range for the first range. But this is skipped, + // because it is zero-width. Subtle edge case! // Now the next should be the new right flank. let second_range = gr_left_iter.next().unwrap(); - assert_eq!(second_range.start(), 49); - - // the end point is the sequence length, since we can only go to end. - assert_eq!(second_range.end(), 50); + assert_eq!(second_range.start(), 90); + let second_range = gr_left_iter.next().unwrap(); + assert_eq!(second_range.end(), 210); } } diff --git a/src/lib.rs b/src/lib.rs index 49518af..82bd346 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -81,7 +81,7 @@ macro_rules! ensure_eq { match (&$left, &$right) { (left_val, right_val) => { if !(*left_val == *right_val) { - panic!("{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, stringify!($left), stringify!($right)); + panic!("{}\nExpected `{}` but found `{}`.", $crate::INTERNAL_ERROR_MESSAGE, stringify!($left), stringify!($right)); } } @@ -91,7 +91,7 @@ macro_rules! ensure_eq { match (&$left, &$right) { (left_val, right_val) => { if !(*left_val == *right_val) { - panic!("{}\n{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, format_args!($($arg)+), stringify!($left), stringify!($right)); + panic!("{}\n{}\nExpected `{}` but found `{}`.", $crate::INTERNAL_ERROR_MESSAGE, format_args!($($arg)+), stringify!($left), stringify!($right)); } } } diff --git a/src/main/mod.rs b/src/main/mod.rs index b97c451..05feb50 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ - commands::{granges_adjust, granges_filter, granges_flank}, + commands::{granges_adjust, granges_filter, granges_flank, ProcessingMode}, prelude::GRangesError, Position, PositionOffset, }; @@ -110,6 +110,10 @@ enum Commands { /// By default, ranges with sequence names not in the genome file will raise an error. #[arg(short = 'f', long)] skip_missing: bool, + + /// Processing mode + #[arg(long)] + in_mem: bool, }, #[cfg(feature = "dev-commands")] @@ -157,6 +161,7 @@ fn run() -> Result<(), GRangesError> { right, output, skip_missing, + in_mem, }) => { if both.is_some() && (left.is_some() || right.is_some()) { let error = clap::Error::raw( @@ -174,7 +179,20 @@ fn run() -> Result<(), GRangesError> { ); return Err(error.into()); } - granges_flank(genome, bedfile, left, right, output.as_ref(), *skip_missing) + let mode = if *in_mem { + ProcessingMode::InMemory + } else { + ProcessingMode::Streaming + }; + granges_flank( + genome, + bedfile, + left, + right, + output.as_ref(), + *skip_missing, + mode, + ) } #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index c51ec13..5841a61 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -55,16 +55,19 @@ impl GenericRangeOperations for RangeEmpty { let mut flanking = Vec::new(); if let Some(left) = left_flank { let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); - let flank_end = std::cmp::min(self.start + 1, seqlen); - - let left_flank_region = RangeEmpty::new(flank_start, flank_end); - flanking.push(left_flank_region); + let flank_end = std::cmp::min(self.start, seqlen); + if flank_end > flank_start { + let left_flank_region = RangeEmpty::new(flank_start, flank_end); + flanking.push(left_flank_region); + } } if let Some(right) = right_flank { - let flank_start = std::cmp::max(self.end.saturating_sub(1), 0); + let flank_start = std::cmp::max(self.end, 0); let flank_end = std::cmp::min(self.end + right, seqlen); - let right_flank_region = RangeEmpty::new(flank_start, flank_end); - flanking.push(right_flank_region); + if flank_end > flank_start { + let right_flank_region = RangeEmpty::new(flank_start, flank_end); + flanking.push(right_flank_region); + } } flanking } @@ -124,16 +127,19 @@ impl GenericRangeOperations for RangeIndexed { let mut flanking = Vec::new(); if let Some(left) = left_flank { let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); - let flank_end = std::cmp::min(self.start + 1, seqlen); - - let left_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); - flanking.push(left_flank_region); + let flank_end = std::cmp::min(self.start, seqlen); + if flank_end > flank_start { + let left_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); + flanking.push(left_flank_region); + } } if let Some(right) = right_flank { - let flank_start = std::cmp::max(self.end.saturating_sub(1), 0); + let flank_start = std::cmp::max(self.end, 0); let flank_end = std::cmp::min(self.end + right, seqlen); - let right_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); - flanking.push(right_flank_region); + if flank_end > flank_start { + let right_flank_region = RangeIndexed::new(flank_start, flank_end, self.index); + flanking.push(right_flank_region); + } } flanking } @@ -193,6 +199,45 @@ impl AdjustableGenericRange for GenomicRangeRecord { } } +impl GenericRangeOperations for GenomicRangeRecord { + /// Create flanking regions for this [`GenomicRangeRecord`] range. + fn flanking_ranges( + &self, + left_flank: Option, + right_flank: Option, + seqlen: Position, + ) -> Vec { + let mut flanking = Vec::new(); + if let Some(left) = left_flank { + let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); + let flank_end = std::cmp::min(self.start, seqlen); + if flank_end > flank_start { + let left_flank_region = GenomicRangeRecord::new( + self.seqname.clone(), + flank_start, + flank_end, + self.data.clone(), + ); + flanking.push(left_flank_region); + } + } + if let Some(right) = right_flank { + let flank_start = std::cmp::max(self.end, 0); + let flank_end = std::cmp::min(self.end + right, seqlen); + if flank_end > flank_start { + let right_flank_region = GenomicRangeRecord::new( + self.seqname.clone(), + flank_start, + flank_end, + self.data.clone(), + ); + flanking.push(right_flank_region); + } + } + flanking + } +} + impl TsvSerialize for GenomicRangeRecord<()> { fn to_tsv(&self) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) @@ -219,6 +264,14 @@ impl TsvSerialize for GenomicRangeRecord<()> { // } impl TsvSerialize for GenomicRangeRecord { + /// Serialize this [`GenomicRangeRecord`] to TSV, using the recursive [`TsvSerialize`] + /// trait method. + /// + /// Note that this does *not* append the newline — it is up to + /// the `U.to_tsv()` trait method implementation. This is design is + /// intentional, since one common use case is when `U` is an [`Option`], + /// which is the type combination commonly used in lazy parsing. In this case, + /// when the data is `None`, that indicates no more columns. fn to_tsv(&self) -> String { format!( "{}\t{}\t{}{}", @@ -270,6 +323,43 @@ impl AdjustableGenericRange for GenomicRangeEmptyRecord { } } +impl GenericRangeOperations for GenomicRangeEmptyRecord { + /// Create flanking regions for this [`GenomicRangeEmptyRecord`] range. + fn flanking_ranges( + &self, + left_flank: Option, + right_flank: Option, + seqlen: Position, + ) -> Vec { + let mut flanking = Vec::new(); + if let Some(left) = left_flank { + let flank_start = std::cmp::max(self.start.saturating_sub(left), 0); + let flank_end = std::cmp::min(self.start, seqlen); + if flank_end > flank_start { + let left_flank_region = + GenomicRangeEmptyRecord::new(self.seqname.clone(), flank_start, flank_end); + flanking.push(left_flank_region); + } + } + if let Some(right) = right_flank { + let flank_start = std::cmp::max(self.end, 0); + let flank_end = std::cmp::min(self.end + right, seqlen); + if flank_end > flank_start { + let right_flank_region = + GenomicRangeEmptyRecord::new(self.seqname.clone(), flank_start, flank_end); + flanking.push(right_flank_region); + } + } + flanking + } +} + +impl TsvSerialize for GenomicRangeEmptyRecord { + fn to_tsv(&self) -> String { + format!("{}\t{}\t{}", self.seqname, self.start, self.end) + } +} + /// Represents a range entry, with indices to sequence name and possibly data. #[derive(Debug, Clone, PartialEq)] pub struct GenomicRangeIndexedRecord { diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index 2107d4c..fd746ec 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,10 +2,7 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{ - traits::{AdjustableGenericRange, GenericRange}, - Position, PositionOffset, -}; +use crate::{traits::AdjustableGenericRange, Position, PositionOffset}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 93cc157..3608877 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -76,7 +76,7 @@ fn test_against_bedtools_slop() { /// granges filter --genome --left --right #[test] fn test_against_bedtools_intersect_wa() { - let num_ranges = 1_000_000; + let num_ranges = 100_000; let random_bedfile_left_tempfile = random_bedfile(num_ranges); let random_bedfile_right_tempfile = random_bedfile(num_ranges); @@ -125,3 +125,85 @@ fn test_against_bedtools_intersect_wa() { String::from_utf8_lossy(&granges_output.stdout) ); } + +/// Test bedtools flank -g -i -l 10 -r 20 +/// against +/// granges filter --genome --left 10 --right 20 +#[test] +fn test_against_bedtools_flank() { + let num_ranges = 100_000; + + let random_bedfile_tempfile = random_bedfile(num_ranges); + let random_bedfile = random_bedfile_tempfile.path(); + + // for testing: uncomment and results are local for inspection + // let random_bedfile_left = Path::new("test_left.bed"); + // let random_bedfile_right = Path::new("test_right.bed"); + + granges_random_bed( + "tests_data/hg38_seqlens.tsv", + num_ranges, + Some(&random_bedfile), + true, + ) + .expect("could not generate random BED file"); + + let bedtools_outfile = temp_bedfile(); + let bedtools_output = Command::new("bedtools") + .arg("flank") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-l") + .arg("20") + .arg("-r") + .arg("30") + .arg("-i") + .arg(&random_bedfile) + .arg(bedtools_outfile.path()) + .output() + .expect("bedtools flank failed"); + + let bedtools_sorted_output = Command::new("bedtools") + .arg("sort") + .arg("-i") + .arg(bedtools_outfile.path()) + .output() + .expect("bedtools intersect failed"); + + let granges_outfile = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("flank") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg("20") + .arg("--right") + .arg("30") + .arg(&random_bedfile) + .arg(granges_outfile.path()) + .output() + .expect("granges flank failed"); + + let granges_sorted_output = Command::new("bedtools") + .arg("sort") + .arg("-i") + .arg(bedtools_outfile.path()) + .output() + .expect("bedtools intersect failed"); + + assert!( + bedtools_sorted_output.status.success(), + "{:?}", + bedtools_sorted_output + ); + assert!( + granges_sorted_output.status.success(), + "{:?}", + granges_sorted_output + ); + + assert_eq!( + String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout) + ); +}