From 548fbd0e46703603ef9232463e91d075d7983398 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Wed, 14 Feb 2024 21:16:12 -0800 Subject: [PATCH] Basic iterator trait, range pushing methods, to coitrees, and more! - tests and test utilies - rough join stuff - interval/range conversion methods - new GRanges methods refined - iteration methods - read_seqlens - PathBuf-based arguments in io - lazy BED parsing - invalid BED tests - adjust range operation with tests - clippy & fmt - GitHub Rust workflow added --- .github/workflows/rust.yml | 22 +++ Cargo.toml | 28 ++- src/data/mod.rs | 3 + src/data/ndarray.rs | 62 ++++++ src/data/vec.rs | 33 ++++ src/error.rs | 24 ++- src/granges.rs | 177 +++++++++++++---- src/io/io.rs | 191 ++++++++++++++++++ src/io/mod.rs | 9 + src/io/noodles.rs | 12 ++ src/io/parsers.rs | 344 +++++++++++++++++++++++++++++++++ src/iterators.rs | 78 ++++++++ src/join.rs | 43 +++++ src/lib.rs | 52 ++++- src/main/commands.rs | 22 +++ src/main/mod.rs | 71 +++++++ src/ranges/coitrees.rs | 154 ++++++++++++--- src/ranges/mod.rs | 126 +++++++----- src/ranges/operations.rs | 54 ++++++ src/ranges/vec.rs | 35 +++- src/test_utilities.rs | 39 +++- src/traits.rs | 77 ++++++++ tests_data/example.bed | 5 + tests_data/invalid.bed | 5 + tests_data/noodles_example.bed | 2 + 25 files changed, 1529 insertions(+), 139 deletions(-) create mode 100644 .github/workflows/rust.yml create mode 100644 src/data/mod.rs create mode 100644 src/data/ndarray.rs create mode 100644 src/data/vec.rs create mode 100644 src/io/io.rs create mode 100644 src/io/mod.rs create mode 100644 src/io/noodles.rs create mode 100644 src/io/parsers.rs create mode 100644 src/iterators.rs create mode 100644 src/main/commands.rs create mode 100644 src/main/mod.rs create mode 100644 src/ranges/operations.rs create mode 100644 tests_data/example.bed create mode 100644 tests_data/invalid.bed create mode 100644 tests_data/noodles_example.bed diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..31000a2 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/Cargo.toml b/Cargo.toml index d69a2d1..0e635ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,33 @@ [package] -name = "granges2" +name = "granges" version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +license = "MIT" +authors = ["Vince Buffalo "] +keywords = ["genomics", "bioinformatics", "compbio"] +categories = ["science"] +documentation = "https://docs.rs/granges/" +repository = "https://github.com/vsbuffalo/granges" +description = "A Rust library and command line tool for genomic range operations." 
[dependencies] +# clap = { version = "4.4.18", features = ["derive"], optional = true } +clap = { version = "4.4.18", features = ["derive"] } coitrees = { version = "0.4.0", features = ["nosimd"] } -genomap = "0.1.5" +flate2 = "1.0.28" +genomap = "0.2.6" indexmap = "2.2.3" +ndarray = "0.15.6" +noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" thiserror = "1.0.57" + +# [features] +# cli = [ "clap" ] + +[[bin]] +name = "granges" +path = "src/main/mod.rs" +# required-features = ["cli"] + + diff --git a/src/data/mod.rs b/src/data/mod.rs new file mode 100644 index 0000000..a2966e8 --- /dev/null +++ b/src/data/mod.rs @@ -0,0 +1,3 @@ +//! Data container implementations. + +pub mod vec; diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs new file mode 100644 index 0000000..2885f34 --- /dev/null +++ b/src/data/ndarray.rs @@ -0,0 +1,62 @@ +//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`]. + +use ndarray::{Array1, Array2, ArrayView1}; +use crate::traits::IndexedDataContainer; + +impl<'a, U> IndexedDataContainer<'a> for Array1 +where + U: Copy + Default + 'a, +{ + type Item = U; + type Output = Array1; + + fn get_value(&'a self, index: usize) -> Self::Item { + self[index] + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) + } +} + +impl<'a, U> IndexedDataContainer<'a> for Array2 +where + U: Copy + Default + 'a, +{ + type Item = ArrayView1<'a, U>; + type Output = Array2; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.row(index) + } + + fn len(&self) -> usize { + self.shape()[0] + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + let cols = self.shape()[1]; + + let rows_data: Vec = indices + .iter() + .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) + .collect(); + + // create a new Array2 from the rows + // shape is (number of indices, number of columns) + Array2::from_shape_vec((indices.len(), cols), rows_data) + .expect("Shape and collected data size mismatch") + } +} diff --git a/src/data/vec.rs b/src/data/vec.rs new file mode 100644 index 0000000..a794fe4 --- /dev/null +++ b/src/data/vec.rs @@ -0,0 +1,33 @@ +//! Data container implementations for [`Vec`]. + + +/// Trait methods for the commonly-used `Vec` data container. +/// +/// Note that the associated `Item` type is always a *reference* to the data elements. 
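+///
+/// # Example
+/// A short sketch of this container's trait methods (marked `ignore`, so it is
+/// not run as a doc test):
+/// ```ignore
+/// use granges::traits::IndexedDataContainer;
+///
+/// let data = vec![0.1, 0.2, 0.3, 0.4];
+/// assert_eq!(*data.get_value(2), 0.3);
+/// // build a new data container from a subset of range indices
+/// assert_eq!(data.new_from_indices(&[0, 3]), vec![0.1, 0.4]);
+/// ```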
+impl<'a, U> IndexedDataContainer<'a> for Vec +where + U: Clone + 'a, +{ + type Item = &'a U; + type Output = Vec; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.get(index).unwrap() + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + self.get(index).is_some() + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) + } +} + + + + diff --git a/src/error.rs b/src/error.rs index 9f636f1..7feabc3 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,13 +1,33 @@ +use std::num::{ParseIntError, ParseFloatError}; + +use genomap::GenomeMapError; use thiserror::Error; use crate::Position; #[derive(Debug, Error)] pub enum GRangesError { + // IO related errors + #[error("File reading eror: {0}")] + IOError(#[from] std::io::Error), + + // File parsing related errors + #[error("Integer parsing error: {0}")] + ParseIntError(#[from] ParseIntError), + #[error("Float parsing error: {0}")] + ParseFloatError(#[from] ParseFloatError), + #[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")] + BedlikeTooFewColumns(String), + #[error("File has invalid column type entry: {0}")] + InvalidColumnType(String), + + // Invalid genomic range errors #[error("Range invalid: start ({0}) must be greater than end ({1})")] InvalidGenomicRange(Position, Position), - #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] InvalidGenomicRangeForSequence(Position, Position, Position), - + #[error("Sequence name '{0}' is not the ranges container")] + MissingSequence(String), + #[error("Error encountered in genomap::GenomeMap")] + GenomeMapError(#[from] GenomeMapError), } diff --git a/src/granges.rs b/src/granges.rs index 13c9365..e6b679a 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,17 +1,27 @@ use genomap::GenomeMap; - -use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position}; - - +use indexmap::IndexMap; + +use crate::{ + prelude::GRangesError, + ranges::{ + coitrees::{COITrees, COITreesIndexed}, + vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, + RangeEmpty, RangeIndexed, RangeRecord, + }, + traits::{RangeContainer, RangesIterable}, + Position, iterators::GRangesIterator, +}; + +#[derive(Clone, Debug)] pub struct GRanges { - ranges: GenomeMap, - data: Option, + pub(crate) ranges: GenomeMap, + pub(crate) data: Option, } - impl GRanges -where C: RangeContainer { - +where + C: RangeContainer, +{ /// Get the total number of ranges. pub fn len(&self) -> usize { self.ranges.values().map(|ranges| ranges.len()).sum() @@ -21,28 +31,50 @@ where C: RangeContainer { pub fn is_empty(&self) -> bool { self.len() == 0 } -} -impl GRanges> { + /// Get the raw range container. + pub fn get_ranges(&self, seqname: &str) -> Option<&C> { + self.ranges.get(seqname) + } + /// Get the sequence names. + pub fn seqnames(&self) -> Vec { + self.ranges.names() + } +} + +impl GRanges, T> { /// Create a new [`GRanges`] object, with vector storage for ranges and data. /// /// This combination of range and data containers is used when loading data into - /// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's - /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] + /// a new [`GRanges`] object, and the size cannot be known beforehand. 
Rust's + /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] /// call the [`Vec`]'s shrink to size methods on the range and data containers /// after data loading to shrink to the minimal necessary size (this can reduce /// memory usage). - pub fn new_vec() -> Self { - let ranges = GenomeMap::new(); - Self { - ranges, - data: None, + pub fn new_vec(seqlens: &IndexMap) -> Self { + let mut ranges = GenomeMap::new(); + for (seqname, length) in seqlens.iter() { + // this should never happen because the error is only if + // insert encounters a seqname that's already been inserted -- that + // cannot happen here. + ranges + .insert(seqname, VecRanges::new(*length)) + .expect("Internal error: please report"); } + Self { ranges, data: None } } +} - - pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) { +impl GRanges> { + /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. + pub fn push_range_with_data( + &mut self, + seqname: &str, + start: Position, + end: Position, + data: U, + ) -> Result<(), GRangesError> { // push data to the vec data container, getting the index let index: usize = { let data_container = self.data.get_or_insert_with(Vec::new); @@ -51,39 +83,108 @@ impl GRanges> { }; // push an indexed range let range = RangeIndexed::new(start, end, index); - self.ranges.entry_or_default(seqname).ranges.push(range); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) } } impl GRanges { + /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. + pub fn push_range_empty( + &mut self, + seqname: &str, + start: Position, + end: Position, + ) -> Result<(), GRangesError> { + // push an unindexed (empty) range + let range = RangeEmpty::new(start, end); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) + } +} - /// Create a new [`GRanges`] object, with vector storage for ranges and no data container. - pub fn new_vec_empty() -> Self { - let ranges = GenomeMap::new(); - Self { - ranges, - data: None, +impl GRanges> { + pub fn from_iter( + iter: I, + seqlens: IndexMap, + ) -> Result>, GRangesError> + where + I: Iterator, GRangesError>>, + { + let mut gr = GRanges::new_vec(&seqlens); + for possible_entry in iter { + let entry = possible_entry?; + gr.push_range_with_data(&entry.seqname, entry.start, entry.end, entry.data)?; } + Ok(gr) } +} - /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. - pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) { - // push an unindexed (empty) range - let range = RangeEmpty::new(start, end); - self.ranges.entry_or_default(seqname).ranges.push(range); +impl GRanges { + pub fn from_iter_empty( + iter: I, + seqlens: IndexMap, + ) -> Result, GRangesError> + where + I: Iterator, GRangesError>>, + { + let mut gr = GRanges::new_vec(&seqlens); + for possible_entry in iter { + let entry = possible_entry?; + gr.push_range_empty(&entry.seqname, entry.start, entry.end)?; + } + Ok(gr) } } +impl GRanges { + /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree + /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library + /// by Daniel C. Jones. 
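+    ///
+    /// # Example
+    /// A small sketch (marked `ignore`, so it is not run as a doc test), mirroring
+    /// the `test_to_coitrees` unit test below:
+    /// ```ignore
+    /// use granges::test_utilities::granges_test_case_01;
+    ///
+    /// let gr_vec = granges_test_case_01();        // VecRanges-backed GRanges
+    /// let gr = gr_vec.to_coitrees().unwrap();     // now backed by interval trees
+    /// assert_eq!(gr.len(), 5);
+    /// ```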
+ pub fn to_coitrees(self) -> Result, GRangesError> { + let old_ranges = self.ranges; + let mut new_ranges = GenomeMap::new(); + for (seqname, vec_ranges) in old_ranges.into_iter() { + let trees = COITrees::from(vec_ranges); + new_ranges.insert(&seqname, trees)?; + } + Ok(GRanges { + ranges: new_ranges, + data: self.data, + }) + } +} +impl GRanges +where R: RangesIterable { + /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object. + pub fn iter_ranges<'a>(&'a self) -> GRangesIterator<'a, R> { + GRangesIterator::new(&self.ranges) + } +} #[cfg(test)] mod tests { - use crate::{prelude::*, test_utilities::random_vecranges}; + use indexmap::indexmap; + + use crate::{ + prelude::*, + test_utilities::{granges_test_case_01, random_vecranges}, + }; #[test] fn test_new_vec() { - let mut gr = GRanges::new_vec(); - gr.push_range_with_data("chr1", 0, 10, 1.1); + let seqlens = indexmap! { "chr1".to_string() => 10}; + let mut gr = GRanges::new_vec(&seqlens); + gr.push_range_with_data("chr1", 0, 10, 1.1).unwrap(); assert_eq!(gr.len(), 1); } @@ -93,4 +194,10 @@ mod tests { assert_eq!(vr.len(), 100) } + #[test] + fn test_to_coitrees() { + let gr_vec = granges_test_case_01(); + let gr = gr_vec.clone().to_coitrees().unwrap(); + assert_eq!(gr.len(), 5); + } } diff --git a/src/io/io.rs b/src/io/io.rs new file mode 100644 index 0000000..f0d62a2 --- /dev/null +++ b/src/io/io.rs @@ -0,0 +1,191 @@ +//! Input/Output file handling with [`InputFile`] and [`OutputFile`]. +//! +//! These types abstract over reading/writing both plaintext and gzip-compressed +//! input/output. + +use flate2::read::GzDecoder; +use flate2::write::GzEncoder; +use flate2::Compression; +use indexmap::IndexMap; +use std::fs::File; +use std::io::Write; +use std::io::{self, BufWriter}; +use std::io::{BufRead, BufReader, Read}; +use std::path::PathBuf; + +use crate::Position; +use crate::error::GRangesError; + +/// Read a tab-delimited *genome file* of sequence (i.e. chromosome) names and their lengths. +pub fn read_seqlens(filepath: impl Into) -> Result, GRangesError> { + let input_file = InputFile::new(filepath); + let reader = input_file.reader()?; + + let mut seqlens = IndexMap::new(); + for result in reader.lines() { + let line = result?; + let mut columns = line.split('\t'); + let seqname = columns.next().unwrap(); + let length: Position = columns.next().unwrap().parse()?; + seqlens.insert(seqname.to_string(), length); + } + Ok(seqlens) +} + +/// Check if a file is a gzipped by looking for the magic numbers +fn is_gzipped_file(file_path: impl Into) -> io::Result { + let mut file = File::open(file_path.into())?; + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + + Ok(buffer == [0x1f, 0x8b]) +} + +/// Represents an input file. +/// +/// This struct is used to handle operations on an input file, such as reading from the file. +/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input +/// to be read through a common interface. +pub struct InputFile { + pub filepath: PathBuf, + pub comments: Option>, + pub header: Option, + pub skip_lines: usize, +} + +impl InputFile { + /// Constructs a new `InputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `InputFile` will automatically uncompress the input. 
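+    ///
+    /// # Example
+    /// A rough sketch (marked `ignore`, so it is not run as a doc test); the
+    /// path below is a hypothetical input file:
+    /// ```ignore
+    /// use std::io::BufRead;
+    /// use granges::io::InputFile;
+    ///
+    /// let input_file = InputFile::new("ranges.bed.gz");
+    /// for line in input_file.reader()?.lines() {
+    ///     println!("{}", line?);
+    /// }
+    /// ```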
+ pub fn new(filepath: impl Into) -> Self { + Self { + filepath: filepath.into(), + comments: None, + header: None, + skip_lines: 0, + } + } + + /// Opens the file and returns a buffered reader. + /// + /// If the file is gzip-compressed (indicated by a ".gz" extension), this method will + /// automatically handle the decompression. + /// + /// # Returns + /// + /// A result containing a `BufReader>` on success, or a `FileError` on failure. + /// + pub fn reader(&self) -> io::Result>> { + let file = File::open(self.filepath.clone())?; + //let is_gzipped_name = self.filepath.ends_with(".gz"); + let is_gzipped = is_gzipped_file(&self.filepath)?; + let reader: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + Ok(BufReader::new(reader)) + } + + /// Collects comment lines and/or a line at the start of the file. + pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result { + let mut buf_reader = self.reader()?; + let mut comments = Vec::new(); + let mut line = String::new(); + + while buf_reader.read_line(&mut line)? > 0 { + if line.starts_with(comment) { + comments.push(line.trim_end().to_string()); + self.skip_lines += 1; + } else if let Some(header_string) = header { + if line.starts_with(header_string) { + self.header = Some(line.trim_end().to_string()); + self.skip_lines += 1; + // We only handle one header line. If there are more, the + // file is *very* poorly formatted. So just let downstream + // parsing errors catch this. In the future, we could have a specialized + // error. + break; + } + // break on the first non-header/comment line + break; + } + line.clear(); + } + + self.comments = Some(comments); + Ok(self.skip_lines > 0) + } + + /// Method to continue reading after skipping the comment and header lines. + pub fn continue_reading(&self) -> io::Result>> { + let mut buf_reader = self.reader()?; + let mut skipped_lines = 0; + let mut line = String::new(); + + // skip the lines that were previously read as comments or header + while skipped_lines < self.skip_lines { + buf_reader.read_line(&mut line)?; + skipped_lines += 1; + line.clear(); + } + Ok(buf_reader) + } +} + +/// Represents an output file. +/// +/// This struct is used to handle operations on an output file, such as writing to the file. +/// This abstracts writing both plaintext and gzip-compressed files. +pub struct OutputFile { + pub filepath: PathBuf, + pub header: Option>, +} + +impl OutputFile { + /// Constructs a new `OutputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `OutputFile` will automatically write gzip-compressed output. + /// * `header` - An optional vector of strings representing commented header lines to be written to the file. + pub fn new(filepath: impl Into, header: Option>) -> Self { + Self { + filepath: filepath.into(), + header, + } + } + + /// Opens the file and returns a writer. + /// + /// If the file path ends with ".gz", the file is treated as gzip-compressed, and the + /// function will handle compression automatically. If a header is set, it will be written + /// to the file. + /// + /// # Returns + /// + /// A result containing a `Box` on success, or an `io::Error` on failure. 
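+    ///
+    /// # Example
+    /// A rough sketch (marked `ignore`, so it is not run as a doc test); the
+    /// output path below is a hypothetical example:
+    /// ```ignore
+    /// use std::io::Write;
+    /// use granges::io::OutputFile;
+    ///
+    /// let output = OutputFile::new("ranges.bed.gz", Some(vec!["created by granges".to_string()]));
+    /// let mut writer = output.writer()?;
+    /// // the header entry above is written first, prefixed with '#'
+    /// writeln!(writer, "chr1\t0\t10")?;
+    /// ```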
+ pub fn writer(&self) -> io::Result> { + let outfile = &self.filepath; + let is_gzip = outfile.ends_with(".gz"); + let mut writer: Box = if is_gzip { + Box::new(BufWriter::new(GzEncoder::new( + File::create(outfile)?, + Compression::default(), + ))) + } else { + Box::new(BufWriter::new(File::create(outfile)?)) + }; + // write header if one is set + if let Some(entries) = &self.header { + for entry in entries { + writeln!(writer, "#{}", entry)?; + } + } + Ok(writer) + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs new file mode 100644 index 0000000..10df12c --- /dev/null +++ b/src/io/mod.rs @@ -0,0 +1,9 @@ +//! Input/Output +//! + +pub mod io; +pub mod noodles; +pub mod parsers; + +pub use io::{InputFile, OutputFile}; +pub use parsers::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; diff --git a/src/io/noodles.rs b/src/io/noodles.rs new file mode 100644 index 0000000..242e308 --- /dev/null +++ b/src/io/noodles.rs @@ -0,0 +1,12 @@ +//! + +use crate::Position; +use noodles::core::Position as NoodlesPosition; + +/// Convert from [`noodles::core::Position`], which is [1-based +/// indexing](https://docs.rs/noodles-core/latest/noodles_core/position/struct.Position.html) +/// to GRanges internal 0-based indexing. +pub fn convert_noodles_position(value: NoodlesPosition) -> Position { + let one_based_position: Position = value.get().try_into().unwrap(); + one_based_position - 1 +} diff --git a/src/io/parsers.rs b/src/io/parsers.rs new file mode 100644 index 0000000..574f0af --- /dev/null +++ b/src/io/parsers.rs @@ -0,0 +1,344 @@ +//! Functionality for general parsing, by turning BED-like files into iterators. +//! + +use noodles::bed::{self}; +use std::collections::HashSet; +use std::io::{BufRead, BufReader, Read}; +use std::path::PathBuf; + +use crate::Position; +use crate::error::GRangesError; +use crate::io::io::InputFile; +use crate::ranges::RangeRecord; +use crate::traits::GeneralRangeRecordIterator; + +use super::noodles::convert_noodles_position; + +pub struct Bed3RecordIterator { + bed_reader: bed::Reader>>, +} + +impl Bed3RecordIterator { + pub fn new(filepath: impl Into) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + let bed_reader = bed::Reader::new(reader); + Ok(Self { bed_reader }) + } + + pub fn from_reader(reader: BufReader>) -> Self { + let bed_reader = bed::Reader::new(reader); + Self { bed_reader } + } +} + +impl Iterator for Bed3RecordIterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.bed_reader.records::<3>().next().map(|res| { + res.map_err(GRangesError::IOError) + .map(|record| RangeRecord { + seqname: record.reference_sequence_name().to_string(), + start: convert_noodles_position(record.start_position()), + end: convert_noodles_position(record.end_position()), + data: (), + }) + }) + } +} + +impl GeneralRangeRecordIterator<()> for Bed3RecordIterator { + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals { + FilteredIntervals::new(self, Some(seqnames), None) + } + + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals { + FilteredIntervals::new(self, None, Some(seqnames)) + } +} + +pub struct TsvRecordIterator { + reader: BufReader>, + parser: F, + phantom: std::marker::PhantomData, +} + +impl TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + pub fn new(filepath: impl Into, parser: F) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + + Ok(Self { + reader, + parser, + phantom: 
std::marker::PhantomData, + }) + } +} + +impl Iterator for TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => None, + Ok(_) => { + let line = line.trim_end(); + Some((self.parser)(line)) + } + Err(e) => Some(Err(GRangesError::IOError(e))), + } + } +} + +/// A BED-like file parser. This works by lazy-parsing the first three +/// columns, which are standard to all BED files. +pub struct BedlikeIterator { + iter: TsvRecordIterator Result, GRangesError>, String>, +} + +impl BedlikeIterator { + pub fn new(filepath: impl Into) -> Result { + // Wrap the parse_bedlike_to_range_record function to conform with TsvRecordIterator's expectations. + let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; + + let iter = TsvRecordIterator::new(filepath, parser)?; + Ok(Self { iter }) + } +} + +impl Iterator for BedlikeIterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + + + +/// An iterator over [`IntervalRecord`] items that filters based on sequence name. +/// +/// Note that that the exclude filter is prioritized over the retain filter. So, +/// if a pipeline contains both, then if a chromosome is supplied to +/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, +/// it is skipped. This is to prevent validation and returning a [`Result`]. +/// +/// # Example +/// +/// ``` +/// use granges::prelude::*; +/// use noodles::bed; +/// +/// let iter = Bed3RecordIterator::new("tests_data/example.bed") +/// .expect("error reading file") +/// .exclude_seqnames(vec!["chr1".to_string()]); +/// +/// let seqlens = seqlens! { "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; +/// let gr = GRanges::from_iter(iter, seqlens) +/// .expect("parsing error"); +/// let mut iter = gr.iter_ranges(); +/// +/// // the first range should be the third range in the file, +/// // chr2:4-5 +/// assert_eq!(iter.next().unwrap().start, 4); +/// assert_eq!(iter.next().unwrap().end, 5); +/// ``` +pub struct FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, +} + +impl FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + pub fn new( + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, + ) -> Self { + let retain_seqnames = retain_seqnames.map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); + Self { + inner, + retain_seqnames, + exclude_seqnames, + } + } +} + +impl Iterator for FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + type Item = Result, GRangesError>; + + /// Get the next filtered entry, prioritizing exclude over retain. + fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +/// Parses a single column from a string slice into a specified type. +/// +/// This function is particularly useful for converting columns in genomic data files +/// from string representation to a specific type like `usize`, `f64`, etc. 
+/// +/// # Type Parameters +/// +/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. +/// +/// # Arguments +/// +/// * `column`: The string slice representing the column to be parsed. +/// * `line`: The entire line from which the column was extracted. Used for error reporting. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. +pub fn parse_column(column: &str, line: &str) -> Result +where + ::Err: std::fmt::Debug, +{ + // NOTE: this is used a lot, and should be benchmarked. + column + .parse::() + .map_err(|_| GRangesError::InvalidColumnType(format!("column '{}' in '{}'", column, line))) +} + +/// Parses a BED-like format line into its constituent components. +/// +/// BED-like formats contain BED3 columns (chromosome, start, and end) and +/// others. It's an non-standard but extensible (since it's just a TSV), and sensible +/// for a variety of genomic data. +/// +/// # Arguments +/// +/// * `line`: A string slice representing a line in BED3 format. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns +/// or if any column cannot be correctly parsed. +pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + // Collect remaining columns + let additional_columns = columns[3..].to_vec(); + + Ok((seqname, start, end, additional_columns)) +} + +/// Lazily parses a BED* format line into the first three columns defining the range, +/// storing the rest as a `String`. +pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { + let columns: Vec<&str> = line.splitn(4, '\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + let data = columns[3].to_string(); + + Ok(RangeRecord { + seqname, + start, + end, + data, + }) +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + + use super::BedlikeIterator; + + #[test] + fn test_parser() { + // based on this example: https://docs.rs/noodles-bed/latest/noodles_bed/struct.Reader.html#method.records + let iter = Bed3RecordIterator::new("tests_data/noodles_example.bed").unwrap(); + + let seqlens = seqlens! { "sq0" => 10 }; + let gr: GRanges = GRanges::from_iter_empty(iter, seqlens).unwrap(); + + assert_eq!(gr.len(), 2); + + // let mut gr_iter = gr.iter_ranges(); + // let first_interval = gr_iter.next().unwrap(); + // assert_eq!(first_interval.first, 7); + // assert_eq!(first_interval.last, 12); + // + // let second_interval = gr_iter.next().unwrap(); + // assert_eq!(second_interval.first, 20); + // assert_eq!(second_interval.last, 33); + } + + #[test] + #[should_panic] + fn test_invalid_bed_noodles() { + let iter = Bed3RecordIterator::new("tests_data/invalid.bed").unwrap(); + + let seqlens = seqlens! 
{ "sq0" => 10 }; + let _gr: GRanges = GRanges::from_iter_empty(iter, seqlens).unwrap(); + } + + #[test] + fn test_invalid_bedlike_iterator() { + let iter = BedlikeIterator::new("tests_data/invalid.bed").unwrap(); + let seqlens = seqlens! { "sq0" => 10 }; + let result: Result, _> = GRanges::from_iter(iter, seqlens); + dbg!(&result); + // note: the Rust LSP thinks this isn't used for some reason, so prefaced with _ + // to silence warnings. + let _msg = "column '-1' in 'chr1\t-1\t20'".to_string(); + assert!(matches!( + result, + Err(GRangesError::InvalidColumnType(_msg)) + )); + } +} diff --git a/src/iterators.rs b/src/iterators.rs new file mode 100644 index 0000000..d032d78 --- /dev/null +++ b/src/iterators.rs @@ -0,0 +1,78 @@ +use genomap::GenomeMap; + +use crate::{ + ranges::{RangeIndexed, RangeIndexedRecord}, + traits::{RangeContainer, RangesIterable}, +}; + +/// An iterator over [`RangeIndexedRecord`], which store +/// indices to the sequence names and data container. +/// +/// # Developer Notes +/// Using indices, rather than references to data items and `&str` directly, +/// prevents lifetime complexity. +pub struct GRangesIterator<'a, R> { + ranges: &'a GenomeMap, + current_seqname_index: usize, + current_range_iter: Box + 'a>, +} + +impl<'a, R> GRangesIterator<'a, R> +where + R: RangesIterable, +{ + pub fn new(ranges: &'a GenomeMap) -> Self { + let current_range_iter = ranges.get_by_index(0).unwrap().iter_ranges(); + Self { + ranges, + current_seqname_index: 0, + current_range_iter, + } + } +} + +impl<'a, R> Iterator for GRangesIterator<'a, R> +where + R: RangeContainer + RangesIterable, +{ + type Item = RangeIndexedRecord; + + fn next(&mut self) -> Option { + loop { + if let Some(next_range) = self.current_range_iter.next() { + return Some(RangeIndexedRecord { + seqname_index: self.current_seqname_index, + start: next_range.start, + end: next_range.end, + index: next_range.index, + }); + } else { + // try to load another sequence's set of ranges. + self.current_seqname_index += 1; + if self.current_seqname_index >= self.ranges.len() { + // we're out of range container iterators + return None; + } + self.current_range_iter = self + .ranges + .get_by_index(self.current_seqname_index) + .unwrap() + .iter_ranges(); + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ranges::RangeIndexedRecord, test_utilities::granges_test_case_01}; + + use super::GRangesIterator; + + #[test] + fn test_genomic_ranges_iterator() { + let gr = granges_test_case_01(); + let mut iter = GRangesIterator::new(&gr.ranges); + assert_eq!(iter.next().unwrap(), RangeIndexedRecord::new(0, 0, 5, 0)); + } +} diff --git a/src/join.rs b/src/join.rs index 139597f..a381a51 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,2 +1,45 @@ +use std::rc::Rc; +use crate::Position; +pub struct JoinData { + /// The data index for the left range. + left: usize, + + /// A `Vec` of the indices for the overlapping right ranges. + rights: Vec, + + /// The length of the left range. + left_length: Position, + + /// The lengths of the right ranges. + right_lengths: Vec, + + /// The lengths of the overlaps between the left and right ranges. + overlaps: Vec, + + // TODO: we may want some simple summary of whether something is + // up or downstream. I think the cleanest summary is a signed integer + // representing the side and degree of non-overlap. E.g. a range + // that overlaps another but overhangs the 3' side of the focal left + // range by 10bp is +10; if it were 5', it would be -10. 
+ /// A possible reference to the left data of type `T`. + left_data: Option>, + + /// A possible reference to the right data of type `T`. + right_data: Option>, +} + +pub struct JoinIterator { + left_data: Option>, + right_data: Option>, +} +// +// impl JoinIterator { +// +// } +// +// +// pub fn left_join(left: GRanges, right: GRanges) -> JoinIterator { +// +// } diff --git a/src/lib.rs b/src/lib.rs index cb1be7a..5620e6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,57 @@ +// Copyright (2024) Vince Buffalo +#![crate_name = "granges"] +#![doc(html_root_url = "https://docs.rs/granges/")] + +pub use indexmap; -pub mod traits; -pub mod ranges; -pub mod granges; pub mod error; +pub mod granges; +pub mod io; +pub mod iterators; +pub mod join; +pub mod ranges; pub mod test_utilities; +pub mod traits; pub type Position = u32; +pub type PositionOffset = i32; // signed variant pub mod prelude { - pub use crate::granges::GRanges; pub use crate::error::GRangesError; + pub use crate::granges::GRanges; + pub use crate::io::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; + + pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; + pub use crate::traits::{GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable}; + + pub use crate::seqlens; +} + +/// Create and [`indexmap::IndexMap`] of sequence names and their lengths. +#[macro_export] +macro_rules! seqlens { + ($($key:expr => $value:expr),* $(,)?) => { + $crate::indexmap::indexmap!($($key.to_string() => $value),*) + }; +} + +/// Create a new `GRanges` with sequence length information (used primarily for small examples) +#[macro_export] +macro_rules! create_granges_with_seqlens { + ($range_type:ty, $data_type:ty, { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),*]),* }, seqlens: { $($chr_len:expr => $len:expr),* }) => { + { + let mut seqlens = ::indexmap::IndexMap::new(); + $(seqlens.insert($chr_len.to_string(), $len);)* + + let mut gr: GRanges<$range_type, $data_type> = GRanges::new_vec(&seqlens); + + $( + $( + gr.push_range_with_data(&$chr.to_string(), $start, $end, $data).expect("Failed to push range"); + )* + )* + + gr + } + }; } diff --git a/src/main/commands.rs b/src/main/commands.rs new file mode 100644 index 0000000..a71e774 --- /dev/null +++ b/src/main/commands.rs @@ -0,0 +1,22 @@ +use std::path::PathBuf; + +use granges::io::io::read_seqlens; +use granges::prelude::*; + + + +/// Adjust the genomic ranges in a bedfile by some specified amount. +// NOTE: we don't do build the full GRanges objects here, for efficiency. +// But it would be a good benchmark to see how much slower that is. +pub fn granges_adjust(bedfile: &PathBuf, seqlens: &PathBuf, both: &i32, output: Option<&PathBuf>) -> Result<(), GRangesError> { + let genome = read_seqlens(seqlens)?; + + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + for record in bedlike_iterator { + let range = record?; + let seqname = range.seqname; + let length = *genome.get(&seqname).ok_or(GRangesError::MissingSequence(seqname))?; + } + Ok(()) +} diff --git a/src/main/mod.rs b/src/main/mod.rs new file mode 100644 index 0000000..c944c02 --- /dev/null +++ b/src/main/mod.rs @@ -0,0 +1,71 @@ +use std::path::PathBuf; + +use clap::{Parser, Subcommand}; +use granges::prelude::GRangesError; + +pub mod commands; +use crate::commands::granges_adjust; + +const INFO: &str = "\ +granges: genomic range operations built off of the GRanges library +usage: granges [--help] + +Subcommands: + + adjust: adjust each genomic range, e.g. to add a kilobase to each end. 
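+
+Example (with placeholder file names):
+
+  granges adjust --seqlens <genome_file> --both 1000 <bedfile>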
+ +"; + +#[derive(Parser)] +#[clap(name = "granges")] +#[clap(about = INFO)] +struct Cli { + #[arg(short, long, action = clap::ArgAction::Count)] + debug: u8, + + #[command(subcommand)] + command: Option, +} + +#[derive(Subcommand)] +enum Commands { + Adjust { + /// a TSV genome file of chromosome names and their lengths + #[arg(long, required = true)] + seqlens: PathBuf, + /// an input BED-like TSV file + #[arg(required = true)] + bedfile: PathBuf, + /// number of basepairs to expand the range start and end positions by + #[arg(long)] + both: i32, + /// an optional output file (standard output will be used if not specified) + output: Option, + }, +} + +fn run() -> Result<(), GRangesError> { + let cli = Cli::parse(); + match &cli.command { + Some(Commands::Adjust { + bedfile, + seqlens, + both, + output, + }) => granges_adjust(bedfile, seqlens, both, output.as_ref()), + None => { + println!("{}\n", INFO); + std::process::exit(1); + } + } +} + +fn main() { + match run() { + Ok(_) => {} + Err(e) => { + eprintln!("Error: {}", e); + std::process::exit(1); + } + } +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 2e9d041..78aa8ad 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,49 +1,75 @@ -use coitrees::{Interval, BasicCOITree, IntervalTree, IntervalNode, GenericInterval}; +use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; -use crate::{Position, traits::RangeContainer, error::GRangesError}; +use crate::{error::GRangesError, traits::{RangeContainer}, traits::RangesIterable, Position}; -use super::{vec::VecRanges, RangeIndexed, validate_range}; +use super::{validate_range, vec::VecRanges, RangeEmpty, RangeIndexed}; -type COITreeIntervalIndexed = Interval; +pub type COITreesIndexed = COITrees; + +impl GenericInterval<()> for RangeEmpty { + fn first(&self) -> i32 { + self.start.try_into().unwrap() + } + fn last(&self) -> i32 { + self.end.try_into().unwrap() + } + fn metadata(&self) -> &() { + &() + } +} impl GenericInterval for RangeIndexed { fn first(&self) -> i32 { - self.start().try_into().unwrap() + self.start.try_into().unwrap() } fn last(&self) -> i32 { - self.end().try_into().unwrap() + self.end.try_into().unwrap() } fn metadata(&self) -> &usize { - self.index() + &self.index } } /// A [`coitrees::BasicCOITree`] interval tree for a single sequence's ranges. /// -/// This is generic over the interval type, to handle the case where one -/// may want to do overlap operations on ranges without associated data in +/// This is generic over the interval type, to handle the case where one +/// may want to do overlap operations on ranges without associated data in /// a data container (e.g. ranges that just define megabase windwows). -pub struct COITreeRangeContainer { - ranges: BasicCOITree, +/// +pub struct COITrees { + pub(crate) ranges: BasicCOITree, /// The sequence length, used to validate new ranges. 
- length: Position, + pub length: Position, } -impl COITreeRangeContainer { - pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) +impl std::fmt::Debug for COITrees { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("COITrees") + .field("number of ranges:", &self.ranges.len()) + .field("length", &self.length) + .finish() } +} - pub fn query(&self, start: Position, end: Position, visit: F) - where F: FnMut(&IntervalNode) { - // Note the terminology change to match coitrees (and uses i32s) - let first = start.try_into().expect("could not covert"); - let end: i32 = end.try_into().expect("could not covert"); - // internally coitrees uses 0-indexed, right-inclusive "last" - self.ranges.query(first, end - 1, visit) +impl COITrees { + /// Validate a range, raising an error if it is invalid for some reason. + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + validate_range(start, end, self.length) } + /// Query this range container for a particular range, and call a visit function on all + /// overlapping ranges. + pub fn query(&self, start: Position, end: Position, visit: F) + where + F: FnMut(&IntervalNode), + { + // Note the terminology change to match coitrees (and uses i32s) + let first = start.try_into().expect("could not covert"); + let end: i32 = end.try_into().expect("could not covert"); + // internally coitrees uses 0-indexed, right-inclusive "last" + self.ranges.query(first, end - 1, visit) + } + /// Return the number of ranges in this [`COITreeRangeContainer`] container. pub fn len(&self) -> usize { self.ranges.len() @@ -53,22 +79,92 @@ impl COITreeRangeContainer { pub fn is_empty(&self) -> bool { self.len() == 0 } - } -impl> From> for COITreeRangeContainer { +/// Convert a [`VecRanges`] range container to a [`COITrees`] range container. +impl, M: Clone> From> for COITrees { fn from(value: VecRanges) -> Self { let ranges = BasicCOITree::new(&value.ranges); let length = value.length; - Self { - ranges, - length + Self { ranges, length } + } +} + +/// Convert a [`COITrees`] range container to a [`VecRanges`] range container. +impl From> for VecRanges { + fn from(value: COITrees) -> Self { + let length = value.length; + let mut ranges: VecRanges = VecRanges::new(length); + for interval in value.ranges.iter() { + ranges.push_range(interval.into()); } + ranges } } -impl RangeContainer for COITreeRangeContainer { +/// [`RangeContainer`] trait implementations. +impl RangeContainer for COITrees { fn len(&self) -> usize { self.ranges.len() } } + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeEmpty`]. +impl From> for RangeEmpty { + fn from(value: Interval<&()>) -> Self { + RangeEmpty { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + } + } +} + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeIndexed`]. +impl From> for RangeIndexed { + fn from(value: Interval<&usize>) -> Self { + RangeIndexed { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + index: *value.metadata(), + } + } +} + +/// # Developer Notes +/// +/// Internally, the [`coitrees`] iterator is over their interval type. +/// Their iterator does not consume, but we need an owned (or copyable, +/// as is case here) type to map out. Thus, we need this odd variant of +/// an iterator that doesn't return references and does not consume. 
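+///
+/// A short sketch of iterating over one sequence's ranges (marked `ignore`, so
+/// it is not run as a doc test; it mirrors the unit test below):
+/// ```ignore
+/// use granges::prelude::*;
+/// use granges::ranges::RangeIndexed;
+/// use granges::test_utilities::granges_test_case_01;
+///
+/// let gr = granges_test_case_01().to_coitrees().unwrap();
+/// let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges();
+/// assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0));
+/// ```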
+impl RangesIterable for COITrees { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeIndexed::from(interval)); + Box::new(converted_iter) + } +} + +impl RangesIterable for COITrees<()> { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeEmpty::from(interval)); + Box::new(converted_iter) + } +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + use crate::ranges::RangeIndexed; + use crate::test_utilities::granges_test_case_01; + + #[test] + fn test_ranges_iterable_coitrees() { + let gr = granges_test_case_01().to_coitrees().unwrap(); + let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges(); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(4, 7, 1)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(10, 17, 2)); + assert!(chr1_iter.next().is_none()); + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 1e37bcf..d04742b 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -1,64 +1,74 @@ -use std::ops::Range; +//! Range and Range Containers. +//! +//! -use crate::{Position, error::GRangesError}; +use crate::{error::GRangesError, Position}; pub mod coitrees; pub mod vec; +pub mod operations; - -#[derive(Clone, Default)] +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeEmpty { - range: Range, + pub start: Position, + pub end: Position, } +unsafe impl Sync for RangeEmpty {} +unsafe impl Send for RangeEmpty {} + impl RangeEmpty { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - } - } - - pub fn start(&self) -> Position { - self.range.start - } - - pub fn end(&self) -> Position { - self.range.end + Self { start, end } } } -#[derive(Clone, Debug, Default)] +/// [`RangeIndexed`] is a range with a valid +/// index to a data element in the data container. +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeIndexed { - range: Range, - index: usize, + pub start: Position, + pub end: Position, + pub index: usize, } +unsafe impl Sync for RangeIndexed {} +unsafe impl Send for RangeIndexed {} + impl RangeIndexed { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position, index: usize) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - index - } + Self { start, end, index } } +} - pub fn start(&self) -> Position { - self.range.start - } +/// Represents a parsed range entry, possibly containing some data. +#[derive(Debug, Clone, PartialEq)] +pub struct RangeRecord { + pub seqname: String, + pub start: Position, + pub end: Position, + pub data: U, +} - pub fn end(&self) -> Position { - self.range.end - } +/// Represents a range entry, possible with indices to sequence name and data. +#[derive(Debug, Clone, PartialEq)] +pub struct RangeIndexedRecord { + pub seqname_index: usize, + pub start: Position, + pub end: Position, + pub index: usize, +} - // Note: this returning a reference is required to - // implement coitrees's GenericInterval trait. 
- pub fn index(&self) -> &usize { - &self.index +impl RangeIndexedRecord { + pub fn new(seqname_index: usize, start: Position, end: Position, index: usize) -> Self { + Self { + seqname_index, + start, + end, + index, + } } } @@ -72,33 +82,49 @@ impl RangeIndexed { /// # Returns /// /// * `bool` - `true` if the range is valid for the sequence; otherwise, `false`. -pub fn validate_range(range: &std::ops::Range, length: Position) -> -Result<(), GRangesError> { - let start = range.start; - let end = range.start; - dbg!(&start); - dbg!(&end); +pub fn validate_range( + start: Position, + end: Position, + length: Position, +) -> Result<(), GRangesError> { if start > end { - GRangesError::InvalidGenomicRange(start, end); + return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { - GRangesError::InvalidGenomicRangeForSequence(start, end, length); + return Err(GRangesError::InvalidGenomicRangeForSequence( + start, end, length, + )); } Ok(()) } #[cfg(test)] mod tests { - use crate::prelude::*; use super::validate_range; + use crate::prelude::*; #[test] fn test_invalid_range_start_end() { - let range = 10..1; - let result = validate_range(&range, 10); - dbg!(&range); - assert!(matches!(result, Err(GRangesError::InvalidGenomicRange(10, 0)))); + let result = validate_range(5, 1, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); + } + + #[test] + fn test_valid_range_length() { + let result = validate_range(1, 10, 11); + assert!(result.is_ok()); + } + #[test] + fn test_invalid_range_length() { + let result = validate_range(1, 10, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } } diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs new file mode 100644 index 0000000..1c7feb7 --- /dev/null +++ b/src/ranges/operations.rs @@ -0,0 +1,54 @@ +//! Range Operations. +//! +//! - [`adjust()`]: Adjust range start and end positions. + +use crate::{PositionOffset, Position}; + +use super::RangeIndexed; + +/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] +/// and returning `None` if the range has zero width after adjustment. 
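+///
+/// # Example
+/// A small sketch (marked `ignore`, so it is not run as a doc test), mirroring
+/// the unit tests below:
+/// ```ignore
+/// use granges::ranges::{operations::adjust, RangeIndexed};
+///
+/// let range = RangeIndexed::new(5, 10, 1);
+/// // grow the range by 2 bp on the left and 3 bp on the right,
+/// // clamped to a sequence of length 15
+/// let adjusted = adjust(range, -2, 3, 15).unwrap();
+/// assert_eq!(adjusted, RangeIndexed::new(3, 13, 1));
+/// ```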
+pub fn adjust(range: RangeIndexed, start_delta: PositionOffset, end_delta: PositionOffset, length: Position) -> Option { + let start: PositionOffset = range.start.try_into().unwrap(); + let end: PositionOffset = range.end.try_into().unwrap(); + let length: PositionOffset = length.try_into().unwrap(); + + // ensure within [0, length] + let new_start = (start + start_delta).max(0).min(length); + // ensure new_end >= new_start and within [0, length] + let new_end = (end + end_delta).max(new_start).min(length); + + // check for zero-width range + if new_end <= new_start { + // return None if the range has zero width + None + } else { + Some(RangeIndexed::new(new_start.try_into().unwrap(), new_end.try_into().unwrap(), range.index)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normal_adjustment() { + let range = RangeIndexed::new(5, 10, 1); + let adjusted = adjust(range, -2, 3, 15).unwrap(); + assert_eq!(adjusted, RangeIndexed::new(3, 13, 1)); + } + + #[test] + fn test_out_of_bounds_adjustment() { + let range = RangeIndexed::new(10, 12, 2); + let adjusted = adjust(range, -5, 20, 15).unwrap(); + assert_eq!(adjusted, RangeIndexed::new(5, 15, 2)); + } + + #[test] + fn test_zero_width_result() { + let range = RangeIndexed::new(5, 10, 3); + assert!(adjust(range, 5, -5, 15).is_none()); + } +} + diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index efc8026..0f0dea2 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,21 +1,17 @@ -use crate::{traits::RangeContainer, Position, error::GRangesError}; +use super::{validate_range, RangeEmpty, RangeIndexed}; +use crate::traits::{RangesIntoIterable, RangesIterable}; +use crate::{error::GRangesError, traits::RangeContainer, Position}; -use super::{RangeIndexed, validate_range, RangeEmpty}; pub type VecRangesIndexed = VecRanges; pub type VecRangesEmpty = VecRanges; -#[derive(Clone, Default)] +#[derive(Clone, Debug)] pub struct VecRanges { - pub (crate) ranges: Vec, + pub(crate) ranges: Vec, pub length: Position, } impl VecRanges { - pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) - } - /// Create a new empty [`VecRanges`] container. pub fn new(length: Position) -> Self { Self { @@ -24,6 +20,11 @@ impl VecRanges { } } + /// Validate a range, raising an error if it is invalid for some reason. + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + validate_range(start, end, self.length) + } + /// Add a new range to the [`VecRanges`] container. pub fn push_range(&mut self, range: R) { self.ranges.push(range) @@ -45,3 +46,19 @@ impl RangeContainer for VecRanges { self.ranges.len() } } + +impl RangesIntoIterable for VecRanges { + fn into_iter_ranges(self) -> Box> { + let iter = self.ranges.into_iter(); + Box::new(iter) + } +} + +impl RangesIterable for VecRanges { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + // NOTE: RangeIndexed is copyable but there still is overhead here + let converted_iter = iter.map(|interval| interval.to_owned()); + Box::new(converted_iter) + } +} diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 856e5c2..4b330c0 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,9 +1,13 @@ //! Test cases and test utility functions. //! 
-use indexmap::IndexMap; +use crate::{ + create_granges_with_seqlens, + prelude::{GRanges, VecRangesIndexed}, + ranges::{coitrees::COITrees, vec::VecRanges, RangeEmpty}, + Position, +}; use rand::{thread_rng, Rng}; -use crate::{Position, ranges::{RangeIndexed, vec::VecRanges, RangeEmpty}}; // Stochastic test ranges defaults // @@ -34,8 +38,7 @@ pub fn random_range(chrom_len: Position) -> (Position, Position) { /// Build random sequence lengths pub fn random_seqlen() -> Position { let mut rng = thread_rng(); - let rand_len = rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN); - rand_len + rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome @@ -56,6 +59,30 @@ pub fn random_vecranges(n: usize) -> VecRanges { vr } +/// Build random [`COITrees`] from a random [`VecRanges`]. +pub fn random_coitrees() -> COITrees<()> { + let vr = random_vecranges(100); + let cr: COITrees<()> = vr.into(); + cr +} - - +/// Range test case #1 +/// +/// Ranges: +/// - chr1: +/// (0, 5, Some(1.1)) +/// (4, 7, Some(8.1)) +/// (10, 17, Some(10.1)) +/// - chr2: +/// (10, 20, Some(3.7)) +/// (18, 32, Some(1.1)) +/// +/// Seqlens: { "chr1" => 30, "chr2" => 100 } +/// +/// Sum of all elements: 24.1 +pub fn granges_test_case_01() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(0, 5, 1.1), (4, 7, 8.1), (10, 17, 10.1)], + "chr2" => [(10, 20, 3.7), (18, 32, 1.1)] + }, seqlens: { "chr1" => 30, "chr2" => 100 }) +} diff --git a/src/traits.rs b/src/traits.rs index f473fb0..b7999ee 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,7 +1,84 @@ +//! Traits used by the GRanges library. +//! +use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord}; + + +/// Defines functionality common to all range containers, e.g. [`VecRanges`] and +/// [`COITrees`]. pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool { self.len() == 0 } } + +/// Defines functionality for filtering [`RangeRecord`] entries based on their +/// sequence names. [`RangeRecord`] are predominantly used in reading in data, +/// so these trait methods simplify excluding or retaining ranges based on what +/// sequence (i.e. chromosome) they are on. +pub trait GeneralRangeRecordIterator: + Iterator, GRangesError>> + Sized +{ + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals; + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals; +} + +/// The [`RangesIterable`] trait defines common functionality for iterating over +/// the range types in range containers. +pub trait RangesIterable { + fn iter_ranges(&self) -> Box + '_>; +} + +/// The [`RangesIntoIterable`] trait defines common functionality for *consuming* iterating +/// over the range types in range containers. +pub trait RangesIntoIterable { + fn into_iter_ranges(self) -> Box>; +} + +/// The main [`IndexedDataContainer`] type is used to interact +/// with arbitrary *indexed* container of data of type `DataContainer::Item`. +/// +/// GRanges provides [`IndexedDataContainer`] trait implementations for: +/// +/// - [`ndarray::Array1`] +/// - [`ndarray::Array2`] +/// - [`Vec`] +/// - [`polars::DataFrame`] +/// +/// # Panics +/// This will panic if `DataContainer.get_value()` tries to access +/// an index that does not exist. This is an explicit design choice: +/// we could have returned an `Option` or a `Result`. However, +/// when the index is invalid, there is a serious problem in the creation +/// of the `GRanges` object's set of ranges; this should not ever +/// happen in normal operation. 
Returning `Option`/`Result` just to +/// protect against this edge case would force most `GRanges` +/// functions to return a `Result`. This would clog the API usage, so +/// we opt to just panic. +pub trait IndexedDataContainer<'a> { + // note: I don't think we can refactor out this lifetime, + // since it is needed often for associated types + type Item; + type Output; + // this type is needed to reference the core underlying type, + // eg to handle reference types + fn is_valid_index(&self, index: usize) -> bool; + fn get_value(&'a self, index: usize) -> ::Item; + fn len(&self) -> usize; + fn is_empty(&self) -> bool { + self.len() == 0 + } + fn invalid_indices(&self, indices: &[usize]) -> Vec { + let mut invalid = Vec::new(); + for &index in indices { + if !self.is_valid_index(index) { + invalid.push(index); + } + } + invalid + } + + /// Create a new data container using a set of range indices. + fn new_from_indices(&self, indices: &[usize]) -> Self::Output; +} diff --git a/tests_data/example.bed b/tests_data/example.bed new file mode 100644 index 0000000..0f40e10 --- /dev/null +++ b/tests_data/example.bed @@ -0,0 +1,5 @@ +chr1 10 20 +chr1 14 18 +chr2 4 5 +chr2 5 6 +chr4 9 15 diff --git a/tests_data/invalid.bed b/tests_data/invalid.bed new file mode 100644 index 0000000..28d3141 --- /dev/null +++ b/tests_data/invalid.bed @@ -0,0 +1,5 @@ +chr1 -1 20 +chr1 14 18 +chr2 4 5 +chr2 5 6 +chr4 9 15 diff --git a/tests_data/noodles_example.bed b/tests_data/noodles_example.bed new file mode 100644 index 0000000..eccc0b9 --- /dev/null +++ b/tests_data/noodles_example.bed @@ -0,0 +1,2 @@ +sq0 7 13 +sq0 20 34