Skip to content

Commit

Permalink
AdjustableGenericRange, adjust integration and all other tests passing.
Browse files Browse the repository at this point in the history
 - New AdjustableGenericRange trait, which splits out functionality for
   adjusting ranges. This is because coitrees' internal range type cannot
   be adjusted.

 - New integration test testing BED3 filetype detection on random
   ranges (this was just a byproduct of debugging, and is also tested
   in other tests).

 - Changed `IntoGRangesRef` to `AsGRangesRef`.

 - Clippy fixes.

 - `GenomicRangesFile` returns `Path` now.

 - Debug for `TsvRecordIterator`.

 - Added `GenomicRangesFile.parsing_iterator()`.

 - Cleaned up integration tests, added suffix. A bug was caused by
   the random bedfile creation method not adding the .bed extension,
   which led to an `UnsupportedGenomicRangesFileFormat` error. Very
   hard to find!
  • Loading branch information
vsbuffalo committed Feb 20, 2024
1 parent 0474a07 commit c151fbe
Show file tree
Hide file tree
Showing 13 changed files with 287 additions and 201 deletions.
53 changes: 36 additions & 17 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
use std::path::PathBuf;

use crate::{
io::OutputFile,
io::{OutputFile, parsers::GenomicRangesParsers},
prelude::*,
ranges::operations::adjust_range,
reporting::{CommandOutput, Report},
test_utilities::random_granges,
traits::{IterableRangeContainer, TsvSerialize},
traits::TsvSerialize,
PositionOffset,
};

Expand All @@ -22,23 +22,23 @@ pub fn granges_adjust(
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;

// create the parsing iterator, and detect which variant we need based on
// column number of the first entry.
let bedlike_iterator = BedlikeIterator::new(bedfile)?;

// output stream -- header is None for now (TODO)
// Setup Output stream -- header is None for now (TODO).
let output_stream = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
});
let mut writer = output_stream.writer()?;

// for reporting stuff to the user
// For reporting stuff to the user.
let mut report = Report::new();

let mut skipped_ranges = 0;

if !sort {
// if we don't need to sort, use iterator-based streaming processing
// Create the parsing iterator, and detect which variant we need based on
// column number of the first entry.
let bedlike_iterator = BedlikeIterator::new(bedfile)?;

// If we don't need to sort, use iterator-based streaming processing.
for record in bedlike_iterator {
let range = record?;
let seqname = &range.seqname;
Expand All @@ -56,17 +56,36 @@ pub fn granges_adjust(

if skipped_ranges > 0 {
report.add_issue(format!(
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
}
}
} else {
// if we do need to sort, build up a GRanges variant and adjust ranges that way
// let mut gr = GRanges::from_iter(bedlike_iterator, &genome)?;
// gr.adjust_ranges(-both, both).to_tsv(output)?
}
// If we do need to sort, build up a GRanges variant and adjust ranges through
// the GRanges interface. Note we need to detect and build a specific iterator
// for the filetype.

let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;
match ranges_iter {
GenomicRangesParsers::Bed3(iter) => {
let gr = GRangesEmpty::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
},
GenomicRangesParsers::Bedlike(iter) => {
// Note the call to try_unwrap_data() here: this is because
// we know that the records *do* have data. Unwrapping the Option<String>
// values means that writing to TSV doesn't have to deal with this (which
// always creates headaches).
let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
},
GenomicRangesParsers::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
},

}
}
Ok(CommandOutput::new((), report))
}
//
Expand Down Expand Up @@ -107,7 +126,7 @@ pub fn granges_random_bed(
num: u32,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand Down
68 changes: 33 additions & 35 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ use genomap::GenomeMap;
use indexmap::IndexMap;

use crate::{
io::{parsers::GenomicRangesIteratorVariant, OutputFile},
io::{OutputFile},
iterators::GRangesIterator,
prelude::GRangesError,
ranges::{
Expand All @@ -72,7 +72,7 @@ use crate::{
},
traits::{
GenericRange, GenomicRangesTsvSerialize,
IndexedDataContainer, RangeContainer, IterableRangeContainer, TsvSerialize, IntoGRangesRef,
IndexedDataContainer, RangeContainer, IterableRangeContainer, TsvSerialize, AsGRangesRef, AdjustableGenericRange,
},
Position, PositionOffset,
};
Expand Down Expand Up @@ -121,28 +121,28 @@ where
}
}

impl<C> Into<GRanges<C, ()>> for GRangesEmpty<C> {
fn into(self) -> GRanges<C, ()> {
self.0
impl<C> From<GRangesEmpty<C>> for GRanges<C, ()> {
fn from(value: GRangesEmpty<C>) -> Self {
value.0
}
}

impl<'a, C> IntoGRangesRef<'a, C, ()> for GRangesEmpty<C> {
impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty<C> {
/// Convert a reference to a [`GRangesEmpty<C>`] to a reference to the
/// underlying [`GRanges<C, ()>`]. This is to greatly improve the ergonomics
/// of functions that could take either a [`GRanges`] or [`GRangesEmpty`] type.
fn into_granges_ref(&'a self) -> &'a GRanges<C, ()> {
fn as_granges_ref(&'a self) -> &'a GRanges<C, ()> {
&self.0
}
}

impl<'a, C, T> IntoGRangesRef<'a, C, T> for GRanges<C, T> {
impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges<C, T> {
/// Return a reference of a [`GRanges<C, T>`] object. This is essentially
/// a pass-through method. [`AsGRangesRef`] is not needed in this case,
/// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to
/// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types.
fn into_granges_ref(&'a self) -> &'a GRanges<C, T> {
&self
fn as_granges_ref(&'a self) -> &'a GRanges<C, T> {
self
}
}

Expand Down Expand Up @@ -251,20 +251,23 @@ impl<R: GenericRange, T> GRanges<VecRanges<R>, T> {
self.ranges.values_mut().for_each(|ranges| ranges.sort());
self
}

pub fn shink(&mut self) {
todo!()
}
}


/// Range-adjustment methods, available only when the range type `R`
/// implements [`AdjustableGenericRange`].
impl<R: AdjustableGenericRange, T> GRanges<VecRanges<R>, T> {
/// Adjust all the ranges in this [`GRanges`] object.
///
/// Takes `self` by value and returns it after the adjustment, so calls can be
/// chained. `start_delta` and `end_delta` are passed unchanged to
/// `adjust_ranges` on every range container stored in `self.ranges`.
pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self {
self.ranges
.values_mut()
.for_each(|ranges| ranges.adjust_ranges(start_delta, end_delta));
self
}

/// Not implemented yet: the body is `todo!()`, so calling this panics.
// NOTE(review): the name looks like a typo for `shrink` -- confirm before renaming.
pub fn shink(&mut self) {
todo!()
}
}


impl<R: GenericRange> GRangesEmpty<VecRanges<R>> {
/// Create a new [`GRangesEmpty`] object, with vector storage for ranges and no
Expand All @@ -273,19 +276,21 @@ impl<R: GenericRange> GRangesEmpty<VecRanges<R>> {
GRangesEmpty(GRanges::new_vec(seqlens))
}

pub fn sort(mut self) -> Self {
pub fn sort(self) -> Self {
GRangesEmpty(self.0.sort())
}

pub fn adjust_ranges(mut self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self {
GRangesEmpty(self.0.adjust_ranges(start_delta, end_delta))
}

pub fn shink(&mut self) {
todo!()
}
}

impl<R: AdjustableGenericRange> GRangesEmpty<VecRanges<R>> {
    /// Adjust all the ranges in this [`GRangesEmpty`] object.
    ///
    /// Consumes `self`, forwards `start_delta` and `end_delta` to
    /// `adjust_ranges` on the wrapped [`GRanges`], and re-wraps the result.
    pub fn adjust_ranges(self, start_delta: PositionOffset, end_delta: PositionOffset) -> Self {
        let Self(inner) = self;
        Self(inner.adjust_ranges(start_delta, end_delta))
    }
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {
/// Push a genomic range with its data to the range and data containers in a [`GRanges`] object.
pub fn push_range(
Expand Down Expand Up @@ -395,14 +400,14 @@ impl<T> GRanges<VecRanges<RangeIndexed>, T> {
impl<U> GRanges<VecRangesIndexed, Vec<U>> {
/// Create a new [`GRanges<VecRangesIndexed, Vec<U>>`] object from an iterator over
/// [`GenomicRangeRecord<U>`] records.
pub fn from_iter_with_data<I>(
pub fn from_iter<I>(
iter: I,
seqlens: &IndexMap<String, Position>,
) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError>
where
I: Iterator<Item = Result<GenomicRangeRecord<U>, GRangesError>>,
{
let mut gr = GRanges::new_vec(&seqlens);
let mut gr = GRanges::new_vec(seqlens);
for possible_entry in iter {
let entry = possible_entry?;
gr.push_range(&entry.seqname, entry.start, entry.end, entry.data)?;
Expand All @@ -421,7 +426,7 @@ impl GRangesEmpty<VecRangesEmpty> {
where
I: Iterator<Item = Result<GenomicRangeEmptyRecord, GRangesError>>,
{
let mut gr = GRangesEmpty::new_vec(&seqlens);
let mut gr = GRangesEmpty::new_vec(seqlens);
for possible_entry in iter {
let entry = possible_entry?;
gr.push_range(&entry.seqname, entry.start, entry.end)?;
Expand Down Expand Up @@ -484,15 +489,15 @@ where
pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>(
self,
// right: &GRanges<COITrees<M>, DR>,
right: &'a impl IntoGRangesRef<'a, COITrees<M>, DR>
right: &'a impl AsGRangesRef<'a, COITrees<M>, DR>
) -> Result<GRangesEmpty<VecRangesEmpty>, GRangesError> {
let mut gr = GRangesEmpty::new_vec(&self.seqlens());

let right_ref = right.into_granges_ref();
let right_ref = right.as_granges_ref();

for (seqname, left_ranges) in self.0.ranges.iter() {
for left_range in left_ranges.iter_ranges() {
if let Some(right_ranges) = right_ref.ranges.get(&seqname) {
if let Some(right_ranges) = right_ref.ranges.get(seqname) {
let num_overlaps =
right_ranges.count_overlaps(left_range.start(), left_range.end());
if num_overlaps == 0 {
Expand All @@ -507,7 +512,7 @@ where
}
}

impl<'a, CL, U> GRanges<CL, Vec<U>>
impl<CL, U> GRanges<CL, Vec<U>>
where
CL: IterableRangeContainer,
{
Expand Down Expand Up @@ -540,7 +545,7 @@ where

for (seqname, left_ranges) in self.ranges.iter() {
for left_range in left_ranges.iter_ranges() {
if let Some(right_ranges) = right.ranges.get(&seqname) {
if let Some(right_ranges) = right.ranges.get(seqname) {
let num_overlaps =
right_ranges.count_overlaps(left_range.start(), left_range.end());
if num_overlaps == 0 {
Expand All @@ -558,13 +563,6 @@ where
}
Ok(gr)
}

pub fn filter_overlaps_anti<DR: IndexedDataContainer<'a>>(
&self,
right: &GRanges<COITreesIndexed, DR>,
) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError> {
todo!()
}
}


Expand Down
2 changes: 1 addition & 1 deletion src/io/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fn is_gzipped_file(file_path: impl Into<PathBuf>) -> io::Result<bool> {
/// This struct is used to handle operations on an input file, such as reading from the file.
/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input
/// to be read through a common interface.
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub struct InputFile {
pub filepath: PathBuf,
pub comments: Option<Vec<String>>,
Expand Down
2 changes: 1 addition & 1 deletion src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ pub mod noodles;
pub mod parsers;

pub use file::{InputFile, OutputFile};
pub use parsers::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, TsvRecordIterator};
pub use parsers::{Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParsers, TsvRecordIterator};
Loading

0 comments on commit c151fbe

Please sign in to comment.