Skip to content

Commit

Permalink
GenomicRangeRecordUnwrappable trait added, some renaming, better CLI …
Browse files Browse the repository at this point in the history
…ergo.

 - Changed --seqlens to --genome.
 - Added short arguments.
 - Added skip_missing to skip over ranges not in the genome file.
 - skip_missing required new filtering, so introduced the
   `GenomicRangeRecordUnwrappable` trait. This implements the
   `try_unwrap_data()` method (which was removed from `BedlikeIterator`.
   The previous method in `BedlikeIterator` returned `impl Iterator`
   which since it isn't a concrete type, caused lots of ergonomic issues.
 - Introduced new `UnwrappedRanges` type for the above.
 - Removed `into_variants()`.
 - Implemented `retain_seqnames()` and `exclude_seqnames()` for `UnwrappedRanges`.
  • Loading branch information
vsbuffalo committed Feb 20, 2024
1 parent 3935be9 commit ce69571
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 131 deletions.
4 changes: 2 additions & 2 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ fn bench_range_adjustment(c: &mut Criterion) {
b.iter(|| {
let granges_output = Command::new(granges_binary_path())
.arg("adjust")
.arg("--seqlens")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--both")
.arg("10")
Expand Down Expand Up @@ -86,7 +86,7 @@ fn bench_filter_adjustment(c: &mut Criterion) {
b.iter(|| {
let granges_output = Command::new(granges_binary_path())
.arg("filter")
.arg("--seqlens")
.arg("--genome")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--left")
.arg(&random_bedfile_left)
Expand Down
64 changes: 53 additions & 11 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ pub fn granges_adjust(
// we know that the records *do* have data. Unwrapping the Option<String>
// values means that writing to TSV doesn't have to deal with this (which
// always creates headaches).
let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?;
let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
}
GenomicRangesParser::Unsupported => {
Expand All @@ -94,9 +94,11 @@ pub fn granges_filter(
left_path: &PathBuf,
right_path: &PathBuf,
output: Option<&PathBuf>,
skip_missing: bool,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();

let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;
Expand All @@ -106,19 +108,38 @@ pub fn granges_filter(

match (left_iter, right_iter) {
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => {
let left_gr = GRangesEmpty::from_iter(left, &genome)?;
let right_gr = GRangesEmpty::from_iter(right, &genome)?;
let left_gr;
let right_gr;

if skip_missing {
left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRangesEmpty::from_iter(right, &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

let mut intersection = left_gr.filter_overlaps(&right_gr)?;
let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr = GRangesEmpty::from_iter(left, &genome)?;
let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?;
let left_gr;
let right_gr;

if skip_missing {
left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

Expand All @@ -128,8 +149,17 @@ pub fn granges_filter(
Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?;
let right_gr = GRangesEmpty::from_iter(right, &genome)?;
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRangesEmpty::from_iter(right, &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

Expand All @@ -139,8 +169,20 @@ pub fn granges_filter(
Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?;
let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?;
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

Expand All @@ -163,7 +205,7 @@ pub fn granges_random_bed(
// get the genome info
let genome = read_seqlens(seqlens)?;

let mut gr = random_granges(&genome, num.try_into().unwrap())?;
let mut gr = random_granges(&genome, num)?;

if sort {
gr = gr.sort();
Expand Down
2 changes: 1 addition & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub enum GRangesError {
InvalidGenomicRange(Position, Position),
#[error("Range [{0}, {1}] is invalid for sequence of length {2}")]
InvalidGenomicRangeForSequence(Position, Position, Position),
#[error("Sequence name '{0}' is not the ranges container")]
#[error("Sequence name '{0}' is not in the ranges container")]
MissingSequence(String),
#[error("Error encountered in genomap::GenomeMap")]
GenomeMapError(#[from] GenomeMapError),
Expand Down
162 changes: 90 additions & 72 deletions src/io/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ use std::path::{Path, PathBuf};
use crate::error::GRangesError;
use crate::io::file::InputFile;
use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord};
use crate::traits::GeneralRangeRecordIterator;
use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable};
use crate::Position;

/// Get the *base* extension to help infer filetype, which ignores compression-related
Expand Down Expand Up @@ -365,37 +365,10 @@ impl BedlikeIterator {
self.number_columns() == 3
}

/// Try to unwrap each [`GenomicRangeRecord<Option<String>>`] iterator item into a
/// [`GenomicRangeRecord<String>`] iterator item.
///
/// This will raise errors if:
/// 1. The detected number of columns is < 4 (i.e. there appears to be no data to unwrap).
/// 2. During iteration, a `None` data element is encountered.
pub fn try_unwrap_data(
self,
) -> Result<impl Iterator<Item = Result<GenomicRangeRecord<String>, GRangesError>>, GRangesError>
{
if self.number_columns() < 4 {
return Err(GRangesError::TooFewColumns)?;
}
Ok(self.iter.map(|result| {
result.and_then(|record| {
if let Some(data) = record.data {
Ok(GenomicRangeRecord::new(
record.seqname,
record.start,
record.end,
data,
))
} else {
Err(GRangesError::TryUnwrapDataError)
}
})
}))
}

/// Drop the data in each [`GenomicRangeRecord<Option<String>>`] iterator, converting it to a range-only
/// [`GenomicRangeRecord<()>`] iterator item.
// TODO: candidate for trait? the impl Iterator isn't a concrete type and can be cumbersome
// in terms of ergonomics. See UnwrappedRanges for an example.
pub fn drop_data(self) -> impl Iterator<Item = Result<GenomicRangeRecord<()>, GRangesError>> {
self.iter.map(|result| {
result
Expand All @@ -410,37 +383,6 @@ impl BedlikeIterator {
.unwrap_or_else(Err) // pass through parsing errors
})
}

/// Try to convert the iterator into one of two variants: one with data and one without.
pub fn into_variant(self) -> Result<GenomicRangesIteratorVariant, GRangesError> {
let number_columns = self.number_columns();

if number_columns == 3 {
let without_data_iterator = self.drop_data().map(|result| {
result.map(|record| GenomicRangeRecord {
seqname: record.seqname,
start: record.start,
end: record.end,
data: (),
})
});
Ok(GenomicRangesIteratorVariant::Empty(Box::new(
without_data_iterator,
)))
} else {
let with_data_iterator = self.try_unwrap_data()?.map(|result| {
result.map(|record| GenomicRangeRecord {
seqname: record.seqname,
start: record.start,
end: record.end,
data: record.data,
})
});
Ok(GenomicRangesIteratorVariant::WithData(Box::new(
with_data_iterator,
)))
}
}
}

impl Iterator for BedlikeIterator {
Expand All @@ -465,7 +407,7 @@ impl Iterator for BedlikeIterator {
///
/// let iter = Bed3Iterator::new("tests_data/example.bed")
/// .expect("error reading file")
/// .exclude_seqnames(vec!["chr1".to_string()]);
/// .exclude_seqnames(&vec!["chr1".to_string()]);
///
/// let seqlens = seqlens! { "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 };
/// let gr = GRangesEmpty::from_iter(iter, &seqlens)
Expand All @@ -479,6 +421,7 @@ impl Iterator for BedlikeIterator {
/// assert_eq!(iter.next().unwrap().end, 15);
/// assert_eq!(iter.next(), None);
/// ```
#[derive(Debug)]
pub struct FilteredRanges<I, R>
where
I: Iterator<Item = Result<R, GRangesError>>,
Expand All @@ -494,11 +437,11 @@ where
{
pub fn new(
inner: I,
retain_seqnames: Option<Vec<String>>,
exclude_seqnames: Option<Vec<String>>,
retain_seqnames: Option<&Vec<String>>,
exclude_seqnames: Option<&Vec<String>>,
) -> Self {
let retain_seqnames = retain_seqnames.map(HashSet::from_iter);
let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter);
let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter);
let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter);
Self {
inner,
retain_seqnames,
Expand Down Expand Up @@ -575,18 +518,93 @@ where
}
}

impl GeneralRangeRecordIterator<GenomicRangeEmptyRecord> for Bed3Iterator {
impl GeneralRangeRecordIterator<GenomicRangeRecord<Option<String>>> for BedlikeIterator {
fn retain_seqnames(
self,
seqnames: Vec<String>,
) -> FilteredRanges<Self, GenomicRangeEmptyRecord> {
FilteredRanges::new(self, Some(seqnames), None)
seqnames: &[String],
) -> FilteredRanges<Self, GenomicRangeRecord<Option<String>>> {
FilteredRanges::new(self, Some(&seqnames.to_vec()), None)
}
fn exclude_seqnames(
self,
seqnames: Vec<String>,
seqnames: &[String],
) -> FilteredRanges<Self, GenomicRangeRecord<Option<String>>> {
FilteredRanges::new(self, None, Some(&seqnames.to_vec()))
}
}

impl GeneralRangeRecordIterator<GenomicRangeEmptyRecord> for Bed3Iterator {
fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges<Self, GenomicRangeEmptyRecord> {
FilteredRanges::new(self, Some(&seqnames.to_vec()), None)
}
fn exclude_seqnames(
self,
seqnames: &[String],
) -> FilteredRanges<Self, GenomicRangeEmptyRecord> {
FilteredRanges::new(self, None, Some(seqnames))
FilteredRanges::new(self, None, Some(&seqnames.to_vec()))
}
}

impl<I> GeneralRangeRecordIterator<GenomicRangeRecord<String>> for UnwrappedRanges<I>
where
I: Iterator<Item = Result<GenomicRangeRecord<Option<String>>, GRangesError>>,
{
fn retain_seqnames(
self,
seqnames: &[String],
) -> FilteredRanges<Self, GenomicRangeRecord<String>> {
FilteredRanges::new(self, Some(&seqnames.to_vec()), None)
}
fn exclude_seqnames(
self,
seqnames: &[String],
) -> FilteredRanges<Self, GenomicRangeRecord<String>> {
FilteredRanges::new(self, None, Some(&seqnames.to_vec()))
}
}

#[derive(Debug)]
pub struct UnwrappedRanges<I>
where
I: Iterator<Item = Result<GenomicRangeRecord<Option<String>>, GRangesError>>,
{
inner: I,
}

impl<I> UnwrappedRanges<I>
where
I: Iterator<Item = Result<GenomicRangeRecord<Option<String>>, GRangesError>>,
{
pub fn new(inner: I) -> Self {
Self { inner }
}
}

impl<I> Iterator for UnwrappedRanges<I>
where
I: Iterator<Item = Result<GenomicRangeRecord<Option<String>>, GRangesError>>,
{
type Item = Result<GenomicRangeRecord<String>, GRangesError>;

fn next(&mut self) -> Option<Self::Item> {
if let Some(result) = self.inner.next() {
return Some(result.and_then(|record| match record.data {
Some(data) => Ok(GenomicRangeRecord::new(
record.seqname,
record.start,
record.end,
data,
)),
None => Err(GRangesError::TryUnwrapDataError),
}));
}
None
}
}

impl GenomicRangeRecordUnwrappable for BedlikeIterator {
fn try_unwrap_data(self) -> UnwrappedRanges<Self> {
UnwrappedRanges::new(self)
}
}

Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ pub mod prelude {

pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
pub use crate::traits::{
AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangesTsvSerialize,
IndexedDataContainer, IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize,
AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangeRecordUnwrappable,
GenomicRangesTsvSerialize, IndexedDataContainer, IntoIterableRangesContainer,
IterableRangeContainer, TsvSerialize,
};

pub use crate::seqlens;
Expand Down
Loading

0 comments on commit ce69571

Please sign in to comment.