Skip to content

Commit

Permalink
granges flank validation tests, and many bug fixes.
Browse files Browse the repository at this point in the history
 - Passing granges flank validation tests against bedtools.
 - granges flank now has streaming and in-memory mode; --in-mem
   argument.
 - Fixed bug with flank ranges method.
 - Fixed bugs with output.
 - Fixed `TsvSerialize` issues.
  • Loading branch information
vsbuffalo committed Feb 21, 2024
1 parent b7849fa commit b4b1a1b
Show file tree
Hide file tree
Showing 7 changed files with 328 additions and 50 deletions.
128 changes: 108 additions & 20 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@ use std::path::PathBuf;
use crate::{
io::{parsers::GenomicRangesParser, OutputFile},
prelude::*,
ranges::operations::adjust_range,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
reporting::{CommandOutput, Report},
test_utilities::random_granges,
traits::TsvSerialize,
Position, PositionOffset,
};

#[derive(Clone)]
pub enum ProcessingMode {
Streaming,
InMemory,
}

/// Adjust the genomic ranges in a bedfile by some specified amount.
pub fn granges_adjust(
bedfile: &PathBuf,
Expand Down Expand Up @@ -200,31 +206,113 @@ pub fn granges_flank(
right: Option<Position>,
output: Option<&PathBuf>,
skip_missing: bool,
mode: ProcessingMode,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;

let report = Report::new();
match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let gr = if skip_missing {
GRangesEmpty::from_iter(iter.retain_seqnames(&seqnames), &genome)?
} else {
GRangesEmpty::from_iter(iter, &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
}
GenomicRangesParser::Bedlike(iter) => {
let gr = if skip_missing {
GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)?
} else {
GRanges::from_iter(iter.try_unwrap_data(), &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)

match mode {
// Note: this is kept for benchmarking, to see how costly building GRanges
// objects is versus using streaming.
ProcessingMode::InMemory => match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let gr = if skip_missing {
GRangesEmpty::from_iter(iter.retain_seqnames(&seqnames), &genome)?
} else {
GRangesEmpty::from_iter(iter, &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
}
GenomicRangesParser::Bedlike(iter) => {
let gr = if skip_missing {
GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)?
} else {
GRanges::from_iter(iter.try_unwrap_data(), &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
}
},
ProcessingMode::Streaming => {
// Setup Output stream -- header is None for now (TODO).
let output_stream = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
});
let mut writer = output_stream.writer()?;

match ranges_iter {
// FIXME: code redundancy. But too early now to design traits, etc.
GenomicRangesParser::Bed3(iter) => {
if skip_missing {
for record in iter.retain_seqnames(&seqnames) {
let range = record?;
let seqname = &range.seqname;
let length = *genome
.get(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;

let flanking_ranges = range
.flanking_ranges::<GenomicRangeRecord<String>>(left, right, length);
for flanking_range in flanking_ranges {
writer.write_all(&flanking_range.to_tsv().into_bytes())?;
}
}
} else {
for record in iter {
let range = record?;
let seqname = &range.seqname;
let length = *genome
.get(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;

let flanking_ranges = range
.flanking_ranges::<GenomicRangeEmptyRecord>(left, right, length);
for flanking_range in flanking_ranges {
writer.write_all(&flanking_range.to_tsv().into_bytes())?;
}
}
}
}
GenomicRangesParser::Bedlike(iter) => {
if skip_missing {
for record in iter.retain_seqnames(&seqnames) {
let range = record?;
let seqname = &range.seqname;
let length = *genome
.get(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;

let flanking_ranges = range
.flanking_ranges::<GenomicRangeRecord<String>>(left, right, length);
for flanking_range in flanking_ranges {
writer.write_all(&flanking_range.to_tsv().into_bytes())?;
}
}
} else {
for record in iter {
let range = record?;
let seqname = &range.seqname;
let length = *genome
.get(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;

let flanking_ranges = range
.flanking_ranges::<GenomicRangeEmptyRecord>(left, right, length);
for flanking_range in flanking_ranges {
writer.write_all(&flanking_range.to_tsv().into_bytes())?;
}
}
}
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
}
}
}
}
Ok(CommandOutput::new((), report))
Expand Down
17 changes: 10 additions & 7 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -770,11 +770,11 @@ mod tests {
let mut gr_left_iter = gr_left.iter_ranges();
let first_range = gr_left_iter.next().unwrap();
assert_eq!(first_range.start(), 20);
assert_eq!(first_range.end(), 31);
assert_eq!(first_range.end(), 30);

let second_range = gr_left_iter.next().unwrap();
assert_eq!(second_range.start(), 90);
assert_eq!(second_range.end(), 101);
assert_eq!(second_range.end(), 100);
}

#[test]
Expand All @@ -787,13 +787,16 @@ mod tests {
let mut gr_left_iter = gr_left.iter_ranges();
let first_range = gr_left_iter.next().unwrap();
assert_eq!(first_range.start(), 20);
assert_eq!(first_range.end(), 31);
assert_eq!(first_range.end(), 30);

// NOTE: the next flank would have been a zero-width
// right range for the first range. But this is skipped,
// because it is zero-width. Subtle edge case!

// Now the next should be the new right flank.
let second_range = gr_left_iter.next().unwrap();
assert_eq!(second_range.start(), 49);

// the end point is the sequence length, since we can only go to end.
assert_eq!(second_range.end(), 50);
assert_eq!(second_range.start(), 90);
let second_range = gr_left_iter.next().unwrap();
assert_eq!(second_range.end(), 210);
}
}
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ macro_rules! ensure_eq {
match (&$left, &$right) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
panic!("{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, stringify!($left), stringify!($right));
panic!("{}\nExpected `{}` but found `{}`.", $crate::INTERNAL_ERROR_MESSAGE, stringify!($left), stringify!($right));

}
}
Expand All @@ -91,7 +91,7 @@ macro_rules! ensure_eq {
match (&$left, &$right) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
panic!("{}\n{}\nExpected `{}` but found `{}`.", crate::INTERNAL_ERROR_MESSAGE, format_args!($($arg)+), stringify!($left), stringify!($right));
panic!("{}\n{}\nExpected `{}` but found `{}`.", $crate::INTERNAL_ERROR_MESSAGE, format_args!($($arg)+), stringify!($left), stringify!($right));
}
}
}
Expand Down
22 changes: 20 additions & 2 deletions src/main/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::path::PathBuf;

use clap::{Parser, Subcommand};
use granges::{
commands::{granges_adjust, granges_filter, granges_flank},
commands::{granges_adjust, granges_filter, granges_flank, ProcessingMode},
prelude::GRangesError,
Position, PositionOffset,
};
Expand Down Expand Up @@ -110,6 +110,10 @@ enum Commands {
/// By default, ranges with sequence names not in the genome file will raise an error.
#[arg(short = 'f', long)]
skip_missing: bool,

/// Processing mode
#[arg(long)]
in_mem: bool,
},

#[cfg(feature = "dev-commands")]
Expand Down Expand Up @@ -157,6 +161,7 @@ fn run() -> Result<(), GRangesError> {
right,
output,
skip_missing,
in_mem,
}) => {
if both.is_some() && (left.is_some() || right.is_some()) {
let error = clap::Error::raw(
Expand All @@ -174,7 +179,20 @@ fn run() -> Result<(), GRangesError> {
);
return Err(error.into());
}
granges_flank(genome, bedfile, left, right, output.as_ref(), *skip_missing)
let mode = if *in_mem {
ProcessingMode::InMemory
} else {
ProcessingMode::Streaming
};
granges_flank(
genome,
bedfile,
left,
right,
output.as_ref(),
*skip_missing,
mode,
)
}
#[cfg(feature = "dev-commands")]
Some(Commands::RandomBed {
Expand Down
Loading

0 comments on commit b4b1a1b

Please sign in to comment.