From 2b8aa5a9f241a099f9f293d3cdb40c2af0b3ccd9 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Thu, 22 Feb 2024 23:49:51 -0800 Subject: [PATCH] Doc-writing session. - Added lots of module, type, and function level documentation. - F'ing sick ASCII diagrams. --- Cargo.toml | 6 +- src/commands.rs | 257 +++++++++++++++++++++++------------- src/data/mod.rs | 1 + src/error.rs | 10 +- src/granges.rs | 78 +++++++---- src/io/file.rs | 22 ++-- src/io/mod.rs | 6 +- src/io/noodles.rs | 16 --- src/io/parsers.rs | 277 ++++++++++++++++++++++----------------- src/iterators.rs | 7 +- src/join.rs | 12 +- src/lib.rs | 114 +++++++++++++++- src/main/mod.rs | 4 +- src/ranges/coitrees.rs | 8 +- src/ranges/mod.rs | 2 +- src/ranges/operations.rs | 8 +- src/ranges/vec.rs | 2 + src/reporting.rs | 11 +- src/test_utilities.rs | 2 +- src/traits.rs | 14 +- 20 files changed, 562 insertions(+), 295 deletions(-) delete mode 100644 src/io/noodles.rs diff --git a/Cargo.toml b/Cargo.toml index d67871c..fb5ac8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,10 +22,14 @@ noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" tempfile = "3.10.0" thiserror = "1.0.57" +polars = { version = "0.37.0", optional = true } [features] -# cli = [ "clap" ] // TODO make feature +# cli = [ "clap" ] # TODO make feature dev-commands = [ ] +test_bigranges = [] # NOTE: not used yet, for tests on large files +polars = ["dep:polars"] + [profile.release] opt-level = 3 diff --git a/src/commands.rs b/src/commands.rs index 9440239..e239b26 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,7 +1,9 @@ +//! Command functions that implement each of the `granges` subcommands. 
+ use std::path::PathBuf; use crate::{ - io::{parsers::GenomicRangesParser, OutputFile}, + io::{parsers::GenomicRangesParser, OutputStream}, prelude::*, ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, reporting::{CommandOutput, Report}, @@ -10,13 +12,37 @@ use crate::{ Position, PositionOffset, }; +/// An `enum` to indicate whether an streaming or in-memory algorithm should be used. #[derive(Clone)] pub enum ProcessingMode { Streaming, InMemory, } -/// Adjust the genomic ranges in a bedfile by some specified amount. +/// Adjusts genomic ranges in a BED file by a specified amount. +/// +/// This function modifies the start and end positions of each range in the input BED file based on +/// the provided offsets, ensuring the adjusted ranges do not exceed the sequence lengths specified +/// in the `seqlens` file. If sorting is enabled, the output will be sorted based on the sequence names +/// and start positions. +/// +/// # Arguments +/// +/// * `bedfile` - A reference to a `PathBuf` for the input BED file. +/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths. +/// * `both` - A [`PositionOffset`] specifying how much to adjust the start and end positions. +/// * `output` - An optional reference to a `PathBuf` where the adjusted ranges will be written. Writes +/// to stdout if `None`. +/// * `sort` - A boolean indicating whether to sort the output. +/// +/// # Returns +/// +/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure. +/// +/// # Errors +/// +/// Returns `GRangesError` if the input BED file or sequence lengths file cannot be read, or if +/// an adjusted range exceeds the sequence boundaries. pub fn granges_adjust( bedfile: &PathBuf, seqlens: &PathBuf, @@ -27,8 +53,8 @@ pub fn granges_adjust( let genome = read_seqlens(seqlens)?; // Setup Output stream -- header is None for now (TODO). 
- let output_stream = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) + let output_stream = output.map_or(OutputStream::new_stdout(None), |file| { + OutputStream::new(file, None) }); let mut writer = output_stream.writer()?; @@ -76,6 +102,9 @@ pub fn granges_adjust( let gr = GRangesEmpty::from_iter(iter, &genome)?; gr.adjust_ranges(-both, both).to_tsv(output)? } + GenomicRangesParser::Bed5(iter) => { + unimplemented!() + } GenomicRangesParser::Bedlike(iter) => { // Note the call to try_unwrap_data() here: this is because // we know that the records *do* have data. Unwrapping the Option @@ -92,7 +121,28 @@ pub fn granges_adjust( Ok(CommandOutput::new((), report)) } -/// Retain only the ranges that have at least one overlap with another set of ranges. +/// Filters genomic ranges based on overlaps with another set of ranges. +/// +/// Retains only the ranges from the `left_path` file that have at least one overlap with +/// the ranges in the `right_path` file. The function can optionally skip ranges that do not +/// exist in the provided sequence lengths (e.g. a "genome" file in `bedtools` lingo). +/// +/// # Arguments +/// +/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths. +/// * `left_path` - A reference to a `PathBuf` for the input BED file containing the ranges to filter. +/// * `right_path` - A reference to a `PathBuf` for the BED file containing ranges to check for overlaps. +/// * `output` - An optional reference to a `PathBuf` where the filtered ranges will be written. Writes +/// to stdout if `None`. +/// * `skip_missing` - A boolean indicating whether to skip ranges missing in the sequence lengths file. +/// +/// # Returns +/// +/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure. +/// +/// # Errors +/// +/// Returns [`GRangesError`] if any input file cannot be read, or if there's an issue processing the ranges. 
pub fn granges_filter( seqlens: &PathBuf, left_path: &PathBuf, @@ -198,7 +248,30 @@ pub fn granges_filter( } } -/// Adjust the genomic ranges in a bedfile by some specified amount. +/// Generates flanking regions for genomic ranges in a BED file. +/// +/// For each range in the input BED file, this function computes the flanking regions based on +/// the specified left and right offsets. The flanking regions are constrained by the sequence lengths +/// provided in the `seqlens` file. The function supports both in-memory and streaming modes for processing. +/// +/// # Arguments +/// +/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths. +/// * `bedfile` - A reference to a `PathBuf` for the input BED file. +/// * `left` - An optional `Position` specifying the left flank size. +/// * `right` - An optional `Position` specifying the right flank size. +/// * `output` - An optional reference to a `PathBuf` for the output file. Writes to stdout if `None`. +/// * `skip_missing` - A boolean indicating whether to skip ranges missing in the sequence lengths file. +/// * `mode` - A [`ProcessingMode`] indicating whether to use in-memory or streaming processing. +/// +/// # Returns +/// +/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure. +/// +/// # Errors +/// +/// Returns [`GRangesError`] if the input BED file or sequence lengths file cannot be read, or if there's +/// an issue generating the flanking regions. pub fn granges_flank( seqlens: &PathBuf, bedfile: &PathBuf, @@ -226,6 +299,9 @@ pub fn granges_flank( }; gr.flanking_ranges(left, right)?.to_tsv(output)? } + GenomicRangesParser::Bed5(_iter) => { + unimplemented!() + } GenomicRangesParser::Bedlike(iter) => { let gr = if skip_missing { GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)? @@ -240,8 +316,8 @@ pub fn granges_flank( }, ProcessingMode::Streaming => { // Setup Output stream -- header is None for now (TODO). 
- let output_stream = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) + let output_stream = output.map_or(OutputStream::new_stdout(None), |file| { + OutputStream::new(file, None) }); let mut writer = output_stream.writer()?; @@ -278,6 +354,9 @@ pub fn granges_flank( } } } + GenomicRangesParser::Bed5(_iter) => { + unimplemented!() + } GenomicRangesParser::Bedlike(iter) => { if skip_missing { for record in iter.retain_seqnames(&seqnames) { @@ -318,114 +397,112 @@ pub fn granges_flank( Ok(CommandOutput::new((), report)) } - pub fn granges_map( seqlens: impl Into, left_path: &PathBuf, right_path: &PathBuf, output: Option<&PathBuf>, - skip_missing: bool) - -> Result, GRangesError> { - let genome = read_seqlens(seqlens)?; - let seqnames: Vec = genome.keys().cloned().collect(); - let mut report = Report::new(); + skip_missing: bool, +) -> Result, GRangesError> { + let genome = read_seqlens(seqlens)?; + let seqnames: Vec = genome.keys().cloned().collect(); + let mut report = Report::new(); - let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; - let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; + let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; + let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; + match (left_iter, right_iter) { + (GenomicRangesParser::Bed5(left), GenomicRangesParser::Bed5(right)) => { + let left_gr; + let right_gr; - match (left_iter, right_iter) { - (GenomicRangesParser::Bed5(left), GenomicRangesParser::Bed5(right)) => { - let left_gr; - let right_gr; + if skip_missing { + left_gr = GRanges::from_iter(left.retain_seqnames(&seqnames), &genome)?; + right_gr = GRanges::from_iter(right.retain_seqnames(&seqnames), &genome)?; + } else { + left_gr = GRanges::from_iter(left, &genome)?; + right_gr = GRanges::from_iter(right, &genome)?; + } - if skip_missing { - left_gr = GRanges::from_iter(left.retain_seqnames(&seqnames), &genome)?; - right_gr = 
GRanges::from_iter(right.retain_seqnames(&seqnames), &genome)?; - } else { - left_gr = GRanges::from_iter(left, &genome)?; - right_gr = GRanges::from_iter(right, &genome)?; - } + let right_gr = right_gr.into_coitrees()?; - let right_gr = right_gr.into_coitrees()?; + let left_join = left_gr.left_overlaps(&right_gr)?; - let left_join = left_gr.left_overlaps(&right_gr)?; + // TODO -- map function + // left_join.to_tsv(output)?; - left_join.to_tsv(output)?; + Ok(CommandOutput::new((), report)) + } + (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { + todo!(); + let left_gr; + let right_gr; - Ok(CommandOutput::new((), report)) + if skip_missing { + left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?; + right_gr = GRanges::from_iter( + right.try_unwrap_data().retain_seqnames(&seqnames), + &genome, + )?; + } else { + left_gr = GRangesEmpty::from_iter(left, &genome)?; + right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; } - (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { - todo!(); - let left_gr; - let right_gr; - - if skip_missing { - left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?; - right_gr = GRanges::from_iter( - right.try_unwrap_data().retain_seqnames(&seqnames), - &genome, - )?; - } else { - left_gr = GRangesEmpty::from_iter(left, &genome)?; - right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; - } - let right_gr = right_gr.into_coitrees()?; + let right_gr = right_gr.into_coitrees()?; - let intersection = left_gr.filter_overlaps(&right_gr)?; - intersection.to_tsv(output)?; + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), report)) + } + (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { + let left_gr; + let right_gr; + + if skip_missing { + left_gr = + 
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; + right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?; + } else { + left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; + right_gr = GRangesEmpty::from_iter(right, &genome)?; } - (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { - let left_gr; - let right_gr; - - if skip_missing { - left_gr = - GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; - right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?; - } else { - left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; - right_gr = GRangesEmpty::from_iter(right, &genome)?; - } - let right_gr = right_gr.into_coitrees()?; + let right_gr = right_gr.into_coitrees()?; + + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; - let intersection = left_gr.filter_overlaps(&right_gr)?; - intersection.to_tsv(output)?; + Ok(CommandOutput::new((), report)) + } + (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { + todo!(); + let left_gr; + let right_gr; - Ok(CommandOutput::new((), report)) + if skip_missing { + left_gr = + GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; + right_gr = GRanges::from_iter( + right.try_unwrap_data().retain_seqnames(&seqnames), + &genome, + )?; + } else { + left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; + right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; } - (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { - todo!(); - let left_gr; - let right_gr; - - if skip_missing { - left_gr = - GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; - right_gr = GRanges::from_iter( - right.try_unwrap_data().retain_seqnames(&seqnames), - &genome, - )?; - } else { - left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; 
- right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; - } - let right_gr = right_gr.into_coitrees()?; + let right_gr = right_gr.into_coitrees()?; - let intersection = left_gr.filter_overlaps(&right_gr)?; - intersection.to_tsv(output)?; + let intersection = left_gr.filter_overlaps(&right_gr)?; + intersection.to_tsv(output)?; - Ok(CommandOutput::new((), report)) - } - _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat), + Ok(CommandOutput::new((), report)) } + _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat), } - +} /// Generate a random BED-like file with genomic ranges. pub fn granges_random_bed( @@ -433,7 +510,7 @@ pub fn granges_random_bed( num: usize, output: Option>, sort: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/data/mod.rs b/src/data/mod.rs index 3c031c1..1cea0b5 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -1,4 +1,5 @@ //! Data container implementations. +//! use crate::traits::DataContainer; diff --git a/src/error.rs b/src/error.rs index da84761..0f15f26 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,10 +1,12 @@ -use std::num::{ParseFloatError, ParseIntError}; - +//! The [`GRangesError`] `enum` definition and error messages. +//! +use crate::Position; use genomap::GenomeMapError; +use std::num::{ParseFloatError, ParseIntError}; use thiserror::Error; -use crate::Position; - +/// The [`GRangesError`] defines the standard set of errors that should +/// be passed to the user. #[derive(Debug, Error)] pub enum GRangesError { // IO related errors diff --git a/src/granges.rs b/src/granges.rs index 83f921f..ed191d4 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,33 +1,50 @@ -//! # Design +//! The [`GRanges`] and [`GRangesEmpty`] types, and associated functionality. //! //! +//! +----------------------+ +//! | GRanges | +//! | object | +//! +----------------------+ +//! /\ +//! / \ +//! / \ +//! v v +//! 
+-----------+ +-------------+ +//! | Ranges | | Data | +//! | container | | container | +//! | (R) | | (T) | +//! +-----------+ +-------------+ +//! //! # [`GRanges`] Generic Types //! //! [`GRanges`] types are generic over: //! -//! 1. Their **range container** (`R`). This is because different operations to be fast, and thus -//! need different data structures. The [`GRanges`] methods allow for efficient conversion of -//! one range type to another. +//! 1. Their **range container** (`R`). This is because for some range operations to be fast, they +//! need to be converted to different data structures. The [`GRanges`] methods allow for efficient +//! conversion of one range type to another (e.g. [`GRanges::into_coitrees()`]. //! //! 2. The *optional* **data container** (`T`). The data container exists if the ranges have some //! associated data. When range *do* have data, there is an index that associates each range //! with its data element in the data container. //! -//! This brings up the most important thing to know about working with the GRanges library: because -//! of the emphasis on on knowing all types at compile-time (which has performance benefits over -//! runtime type polymorphism), it must be known at compile-time whether ranges have data or not. +//! Because GRanges is designed to be a compile-time library (which has performance benefits over +//! runtime type polymorphism), it must be known at compile-time whether a [`GRanges`] container +//! will have associated data or not. The special [`GRangesEmpty`] type represents (and wraps) +//! [`GRanges`] objects that do not have data. +//! //! -//! In most applications this is known: a user specifies, for example, a GTF file and the contents -//! are parsed and processed accordingly. However, it's not unfeasible to imagine that one would -//! need runtime "polymorphism" over BED3 input (which would lead to a [`GRanges`] object without -//! data) and BED* (e.g. BED5, BED12, etc) input. 
(For example, the GRanges command line tool -//! `granges` runs into this problem — see it's implementation for examples.) //! -//! These two possibilities are handled with two differently typed parsing iterators: -//! [`Bed3Iterator`] and [`BedlikeIterator`]. These yield different parsed range types, -//! [`GenomicRangeEmptyRecord`] and [`GenomicRangeRecord`], respectively. //! +//! TODO //! +//! **High-level data types**: A key feature of GRanges design is that a single type of ranges are +//! contained in the range containers. By knowing that every range in a range container either has +//! an index to a data element in the data container or it does not ahead of time simplifies +//! downstream ergonomics tremendously. +//! +//! **Emphasis on compile-time**: For this, let's consider a common problem: a bioinformatics tool +//! needs to read in a BED-like file that has a variable, unknown at compile time, number of +//! columns. //! //! This is an important concept when working with [`GRanges`] types: //! @@ -55,6 +72,10 @@ //! //! Because at compile-time, the types *need* to be known, there are a few options here. //! +//! +//! [`Bed3Iterator`]: crate::io::parsers::Bed3Iterator +//! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator +//! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees use std::{collections::HashSet, path::PathBuf}; @@ -64,7 +85,7 @@ use indexmap::IndexMap; use crate::{ ensure_eq, - io::OutputFile, + io::OutputStream, iterators::GRangesIterator, join::{JoinData, LeftGroupedJoin}, prelude::GRangesError, @@ -143,7 +164,7 @@ impl<'a, C> AsGRangesRef<'a, C, ()> for GRangesEmpty { impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { /// Return a reference of a [`GRanges`] object. This is essentially - /// a pass-through method. [`IntoGRangesRef`] is not needed in this case, + /// a pass-through method. 
[`AsGRangesRef`] is not needed in this case, /// but is needed elsewhere (see the implementation for [`GRangesEmpty`]) to /// improve the ergonomics of working with [`GRanges`] and [`GRangesEmpty`] types. fn as_granges_ref(&'a self) -> &'a GRanges { @@ -195,8 +216,8 @@ where /// Write fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) - let output = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) + let output = output.map_or(OutputStream::new_stdout(None), |file| { + OutputStream::new(file, None) }); let mut writer = output.writer()?; @@ -214,8 +235,8 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges /// Output a BED3 file for for this data-less [`GRanges`]. fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) - let output = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) + let output = output.map_or(OutputStream::new_stdout(None), |file| { + OutputStream::new(file, None) }); let mut writer = output.writer()?; @@ -505,7 +526,6 @@ impl<'a, DL, DR> GRanges> { data: LeftGroupedJoin, ) -> Result<(), GRangesError> { if self.data.is_none() { - // unlike push_range() panic!("Internal error: JoinData not initialized."); } let data_ref = self.data.as_mut().ok_or(GRangesError::NoDataContainer)?; @@ -535,8 +555,8 @@ where /// [`TsvSerialize`] trait methods defiend for the items in `T`. 
pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) - let output = output.map_or(OutputFile::new_stdout(None), |file| { - OutputFile::new(file, None) + let output = output.map_or(OutputStream::new_stdout(None), |file| { + OutputStream::new(file, None) }); let mut writer = output.writer()?; @@ -550,10 +570,10 @@ where } /// Conduct a left overlap join, consuming self and returning a new - /// [`GRanges>`]. + /// [`GRanges`]. /// /// The [`JoinData`] container contains references to both left and right - /// data containers and a [`Vec`]. Each [`OverlapJoin`] represents + /// data containers and a [`Vec`]. Each [`LeftGroupedJoin`] represents /// a summary of an overlap, which downstream operations use to calculate /// statistics using the information about overlaps. pub fn left_overlaps( @@ -646,10 +666,10 @@ impl GRanges, T> { impl<'a> GRangesEmpty { /// Conduct a left overlap join, consuming self and returning a new - /// [`GRanges>`]. + /// [`GRanges`]. /// /// The [`JoinData`] container contains references to both left and right - /// data containers and a [`Vec`]. Each [`OverlapJoin`] represents + /// data containers and a [`Vec`]. Each [`LeftGroupedJoin`] represents /// a summary of an overlap, which downstream operations use to calculate /// statistics using the information about overlaps. pub fn left_overlaps( @@ -809,7 +829,7 @@ where /// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. /// /// Note that this consumes the `self` [`GRanges`] object, turning it into a new - /// [`GRanges`]. The data container is rebuilt from indices + /// [`GRanges>`]. The data container is rebuilt from indices /// into a new [`Vec`] where `U` is the associated type [`IndexedDataContainer::Item`], /// which represents the individual data element in the data container. 
pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( diff --git a/src/io/file.rs b/src/io/file.rs index 7fff865..f65c2ea 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -1,4 +1,4 @@ -//! Input/Output file handling with [`InputFile`] and [`OutputFile`]. +//! Input/Output file handling with [`InputStream`] and [`OutputStream`]. //! //! These types abstract over reading/writing both plaintext and gzip-compressed //! input/output. @@ -20,7 +20,7 @@ use crate::Position; pub fn read_seqlens( filepath: impl Into, ) -> Result, GRangesError> { - let input_file = InputFile::new(filepath); + let input_file = InputStream::new(filepath); let reader = input_file.reader()?; let mut seqlens = IndexMap::new(); @@ -55,20 +55,20 @@ fn is_gzipped_file(file_path: impl Into) -> io::Result { /// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input /// to be read through a common interface. #[derive(Clone, Debug)] -pub struct InputFile { +pub struct InputStream { pub filepath: PathBuf, pub comments: Option>, pub header: Option, pub skip_lines: usize, } -impl InputFile { - /// Constructs a new `InputFile`. +impl InputStream { + /// Constructs a new `InputStream`. /// /// # Arguments /// /// * `filepath` - A string slice that holds the path to the file. If the file extension is - /// `.gz`, `InputFile` will automatically uncompress the input. + /// `.gz`, `InputStream` will automatically uncompress the input. pub fn new(filepath: impl Into) -> Self { Self { filepath: filepath.into(), @@ -167,18 +167,18 @@ enum OutputDestination { /// /// This struct is used to handle operations on an output file, such as writing to the file. /// This abstracts writing both plaintext and gzip-compressed files. -pub struct OutputFile { +pub struct OutputStream { destination: OutputDestination, pub header: Option>, } -impl OutputFile { - /// Constructs a new `OutputFile`. +impl OutputStream { + /// Constructs a new `OutputStream`. 
/// /// # Arguments /// /// * `filepath` - A string slice that holds the path to the file. If the file extension is - /// `.gz`, `OutputFile` will automatically write gzip-compressed output. + /// `.gz`, `OutputStream` will automatically write gzip-compressed output. /// * `header` - An optional vector of strings representing commented header lines to be written to the file. pub fn new(filepath: impl Into, header: Option>) -> Self { Self { @@ -187,7 +187,7 @@ impl OutputFile { } } - /// Constructs a new [`OutputFile`] for standard output. + /// Constructs a new [`OutputStream`] for standard output. pub fn new_stdout(header: Option>) -> Self { Self { destination: OutputDestination::Stdout, diff --git a/src/io/mod.rs b/src/io/mod.rs index 2b377c4..43df647 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,11 +1,9 @@ -//! Input/Output -//! +//! Types and methods for reading and parsing input and writing output. pub mod file; -pub mod noodles; pub mod parsers; -pub use file::{InputFile, OutputFile}; +pub use file::{InputStream, OutputStream}; pub use parsers::{ Bed3Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, TsvRecordIterator, }; diff --git a/src/io/noodles.rs b/src/io/noodles.rs deleted file mode 100644 index 9e70cdb..0000000 --- a/src/io/noodles.rs +++ /dev/null @@ -1,16 +0,0 @@ -//! - -use crate::Position; -use noodles::core::Position as NoodlesPosition; - -/// Convert from [`noodles::core::Position`], which is [1-based -/// indexing](https://docs.rs/noodles-core/latest/noodles_core/position/struct.Position.html) -/// and right-to GRanges internal 0-based indexing. 
-/// -/// # Developers Note -/// See: https://github.com/zaeleus/noodles/issues/226 -pub fn convert_noodles_range(start: NoodlesPosition, end: NoodlesPosition) -> (Position, Position) { - let start_one_based: Position = start.get().try_into().unwrap(); - let end_one_based: Position = end.get().try_into().unwrap(); - (start_one_based - 1, end_one_based) -} diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 3633fc0..263d11b 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -1,49 +1,80 @@ -//! Functionality for general parsing, by turning BED-like files into iterators. +//! Various types of parsing iterators for range formats. //! -//! ## Parsers +//! To work with genomic data in GRanges, one first needs to read it off disk and parse it. This is +//! done with *parsing iterators*. These iterators work on both plaintext and gzip-compressed +//! files (and bgzip and binary format support will hopefully be added). Each row of a file is +//! yielded as a particular parsed range type, which then can be filtered or altered in while +//! in the iterator, using Rust's powerful [`Iterator`] trait methods. //! -//! Parsers in the GRanges library are designed to be: +//! ## Parsing Iterator Design //! -//! 1. *Iterator-based*: GRanges parsers are iterators, allow entries to be filtered or +//! The key design elements of parsing iterators are: +//! +//! 1. *Iterators*: GRanges parsers are iterators, allow entries to be filtered or //! manipulated on the fly using Rust's iterator methods like [`Iterator.filter`]. +//! Additionally, the trait [`GeneralRangeRecordIterator`] adds additional convenience methods +//! for commmon genomic tasks, like filtering based on chromosome name. //! -//! 2. *Lazy*: GRanges parsers are lazy. Since all BED file formats are a superset of BED3, we can -//! parse the range components out of the first three columns and then just store the remaining -//! unparsed part of the line in a `String`. +//! 2. 
*Lazy*: GRanges parsers are build on lazy parsers. Since all BED file formats are a +//! superset of BED3, we can parse the range components out of the first three columns and then +//! just store the remaining unparsed part of the line in a `String`. Other formats like GTF +//! are similar to BED: we can parse the range components of the file first, and parse the +//! remaining data later *if needed*. //! //! 3. *Permissive*: All GRanges parsers are built off of the [`TsvRecordIterator`], which reads -//! plaintext and gzip-compressed files and parses them according to a specified function. This -//! general parser should be able to accomodate every line-based range bioinformatics format +//! plaintext and gzip-compressed files and parses the record according to a specified function. +//! This general parser should be able to accomodate every line-based range bioinformatics format //! (BED3, BED5, GTF, GFF, VCF, etc). Downstream users can implement their own specific parsers //! for these variants (please feel free to contribute a parser for a commonly-used variant to //! GRanges too!). //! -//! Since GRanges is fundamentally about manipulating genomic range data, all parsers output to the -//! same record type: [`GenomicRangeRecord>`], which is generic over the data. The -//! [`Option`] here is because the lazy parser *at compile time* does not know how many -//! columns it will encounter. If only three columns are encountered (e.g. a BED3 file), the data -//! in this [`GenomicRangeRecord`] are all `None`. Then, the ranges that go into [`GRanges`] object -//! do not have indices, since there is not data container. +//! ## Parsing Iterator Item Types +//! +//! These are the generic types that the iterator yields for each call of [`Iterator::next()`], +//! which represent a single row of data. +//! +//! There are two key item types that flow through parsing iterators, each representing a row of a +//! different kind of data file. +//! +//! 1. 
[`GenomicRangeEmptyRecord`], when a BED3 is loaded. This type indicates the incoming ranges +//! *do not* have associated data that would go into a data container. It is *empty* of data. +//! +//! 2. [`GenomicRangeRecord>`], which is a range record *possibly* (hence, +//! the [`Option`]) with remaining *unparsed* `String` data (e.g. the remaining unparsed +//! columns of a VCF file). //! -//! Otherwise, if there *is* data, this data needs to be pushed to the data container, and the -//! indexed range type ([`RangeIndexed`]) is used in the range containers. +//! Most genomic file formats can be thought of as a genomic range with some associated data, +//! for example: //! -//! While handling of this at compile time leads to very performant code with lower memory -//! overhead, it has the downside that *we must handle both types at compile time*. +//! - VCF files will have information on the alleles and a vector of genotypes. //! +//! - GTF/GFF files store details about the annotated feature (e.g. name, protein ID, etc). //! -//! # Working Downstream of Parsers +//! Thus, most parsing iterators will yield this second type, [`GenomicRangeRecord>`]. //! -//! Often at runtime the exact file format may not be known. A user could specify a BED3 file, -//! which only contains ranges and no data, or a BED* of BED-like file (see terminology below). -//! Since GRanges aims to handle most situations *at compile time*, it must handle the process of -//! figuring out how to take an iterator of [`GenomicRangeRecord`] entries and +//! However, often empty ranges (those that do not carry any data) are useful: for example, ranges +//! that just define genomic windows at regular widths, or masked repeat ranges. In this case, when +//! data is loaded into a [`GRanges`] type, the data container type `T` is `()`. This is a +//! degenerate case that GRanges handles with a specific type called [`GRangesEmpty`], which indicates +//! 
the ranges in the ranges container in [`GRangesEmpty`] do not have any associated data. //! +//! In most cases, people working with genomic file formats will know its type at compile-time. +//! However, if this isn't the case, see [⚙️ Handling Range Data when the type isn't know at runtime] below. //! +//! # ⚙️ Handling Range Data when the type isn't know at runtime //! -//! ## Terminology +//! Because GRanges is a compile-time library, the type of incoming data is not runtime-polymorphic. +//! GRanges implements file detection methods to infer the filetype from the extension and +//! does additional type validation checks. When you want to use GRanges to handle a type +//! not known at compile-time, you will need to use `match` statements against the parsing +//! iterator types defined in [`GenomicRangesParser`]. The best examples of this are the +//! `granges` subcommands implementations. //! -//! - BED3 - BED* - BED-like +//! ## File Format Terminology +//! +//! - BED3 +//! - BED* +//! - BED-like //! //! //! : it could be ranges-only (i.e. a BED3), or contain data (e.g. BED5). The lazy BED parser will @@ -61,13 +92,17 @@ //! //! # BED-like File Parser Design //! -//! All BED formats (BED3, BED5, etc). are built upon a BED3. Often when working with these types +//! All BED formats (BED3, BED5, etc) are built upon a BED3. Often when working with these types //! of formats, many operations do not immediately require full parsing of the line, past the //! *range components*. This is because downstream operations may immediately filter away entries //! (e.g. based on width), or do an overlap operation, and then filter away entries based on some //! overlap criteria. Either way, it may be advantageous to work with ranges that just store some //! generic data. //! +//! [`GRanges`]: crate::granges::GRanges +//! [`GRanges`]: crate::granges::GRanges +//! [`GRangesEmpty`]: crate::granges::GRangesEmpty +//! 
use std::collections::HashSet; use std::io::{BufRead, BufReader}; @@ -75,7 +110,7 @@ use std::marker::PhantomData; use std::path::{Path, PathBuf}; use crate::error::GRangesError; -use crate::io::file::InputFile; +use crate::io::file::InputStream; use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, TsvSerialize}; use crate::Position; @@ -116,7 +151,7 @@ fn get_base_extension>(filepath: P) -> Option { /// file, i.e. the first column is there (there are no reasonable checks /// for sequence names other than presence), and the next to columns can /// be parsed into a [`Position`]. -fn valid_bedlike(input_file: &mut InputFile) -> Result { +fn valid_bedlike(input_file: &mut InputStream) -> Result { let _metadata = input_file.collect_metadata("#", None)?; let mut reader = input_file.continue_reading()?; let mut first_line = String::new(); @@ -144,7 +179,7 @@ fn valid_bedlike(input_file: &mut InputFile) -> Result { } // TODO: we can combine this and the above into an enum -fn valid_bed5(input_file: &mut InputFile) -> Result { +fn valid_bed5(input_file: &mut InputStream) -> Result { let _metadata = input_file.collect_metadata("#", None)?; let mut reader = input_file.continue_reading()?; let mut first_line = String::new(); @@ -167,7 +202,6 @@ fn valid_bed5(input_file: &mut InputFile) -> Result { Ok(false) } - /// Enum that connects a genomic ranges file type to its specific parser. #[derive(Debug)] pub enum GenomicRangesParser { @@ -209,15 +243,17 @@ impl GenomicRangesFile { /// columns are BED3) will have the type [`GenomicRangesFile::Bed3`]. This /// is because downstream [`GRanges`] operations need to know if any /// additional data is present, which would need to be put in a data container. - /// 4. If the file type does not satisfy any of the rules above, it is + /// 4. BED5 files, which are BED3 + a *feature name* and a *strand* column. + /// 5. 
If the file type does not satisfy any of the rules above, it is /// [`GenomicRangesFile::Unsupported`]. /// /// See the `match` statement in the source code for the exact rules. Additional /// genomic range file formats like GTF/GFF can easily be added later. /// + /// [`GRanges`]: crate::granges::GRanges pub fn detect(filepath: impl Into) -> Result { let path: PathBuf = filepath.into(); - let mut input_file = InputFile::new(&path); + let mut input_file = InputStream::new(&path); let _metadata = input_file.collect_metadata("#", None)?; let number_columns = input_file.detect_columns("\t")?; @@ -228,11 +264,16 @@ impl GenomicRangesFile { // test if the first row can be parsed into a BED-like file format. let is_valid_bedlike = valid_bedlike(&mut input_file)?; let is_valid_bed5 = valid_bed5(&mut input_file)?; // TODO OPTIMIZE merge with above - let file_type = match (extension.as_str(), number_columns, is_valid_bedlike, is_valid_bed5) { + let file_type = match ( + extension.as_str(), + number_columns, + is_valid_bedlike, + is_valid_bed5, + ) { ("bed", 3, true, false) => GenomicRangesFile::Bed3(path), ("tsv", 3, true, false) => GenomicRangesFile::Bed3(path), ("bed", 5, true, true) => GenomicRangesFile::Bed5(path), - ("bed", n, true, false,) if n > 3 => GenomicRangesFile::Bedlike(path), + ("bed", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), ("tsv", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), _ => GenomicRangesFile::Unsupported, }; @@ -242,11 +283,11 @@ impl GenomicRangesFile { /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error /// if the filetype is not supported. /// - /// This returns a [`GenomicRangesParsers`] enum, since the parsing iterator filetype + /// This returns a [`GenomicRangesParser`] enum, since the parsing iterator filetype /// cannot be known at compile time. 
pub fn parsing_iterator( filepath: impl Clone + Into, - ) -> Result { + ) -> Result { let path = filepath.into(); match Self::detect(path)? { GenomicRangesFile::Bed3(path) => { @@ -264,7 +305,7 @@ impl GenomicRangesFile { } /// An extensible TSV parser, which uses a supplied parser function to -/// convert a line into a [`RangeRecord`], a range with generic associated +/// convert a line into a [`GenomicRangeRecord`], a range with generic associated /// data. pub struct TsvRecordIterator { reader: BufReader>, @@ -281,12 +322,12 @@ impl std::fmt::Debug for TsvRecordIterator { impl TsvRecordIterator where -F: Fn(&str) -> Result, + F: Fn(&str) -> Result, { /// Create a new [`TsvRecordIterator`], which parses lines from the supplied - /// file path into [`RangeRecord`] using the specified parsing function. + /// file path into [`GenomicRangeRecord`] using the specified parsing function. pub fn new(filepath: impl Into, parser: F) -> Result { - let mut input_file = InputFile::new(filepath); + let mut input_file = InputStream::new(filepath); let _has_metadata = input_file.collect_metadata("#", None); let num_columns = input_file.detect_columns("\t")?; let reader = input_file.continue_reading()?; @@ -302,7 +343,7 @@ F: Fn(&str) -> Result, impl Iterator for TsvRecordIterator where -F: Fn(&str) -> Result, + F: Fn(&str) -> Result, { type Item = Result; @@ -324,18 +365,15 @@ pub enum GenomicRangesIteratorVariant { Empty(Box, GRangesError>>>), } -/// A lazy parser for BED3 (ranges only) files. This parses the first three columns. -/// yields [`GenomicRangeRecord<>>`] entries. If the file is a BED3 file, -/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining -/// string columns to parse. -/// +/// A lazy parser for BED3 (ranges only) files. This parses the first three columns, +/// yielding a [`GenomicRangeRecord`] items. 
#[allow(clippy::type_complexity)] #[derive(Debug)] pub struct Bed3Iterator { iter: TsvRecordIterator< fn(&str) -> Result, GenomicRangeEmptyRecord, - >, + >, } impl Bed3Iterator { @@ -359,7 +397,6 @@ impl Iterator for Bed3Iterator { } } - /// Nucleotide strand enum type. #[derive(Clone, Debug)] pub enum Strand { @@ -384,6 +421,12 @@ pub struct Bed5Addition { strand: Option, } +impl TsvSerialize for &Bed5Addition { + fn to_tsv(&self) -> String { + format!("{}\t{}", self.name, self.strand.to_tsv()) + } +} + impl TsvSerialize for Bed5Addition { fn to_tsv(&self) -> String { format!("{}\t{}", self.name, self.strand.to_tsv()) @@ -400,7 +443,7 @@ pub struct Bed5Iterator { iter: TsvRecordIterator< fn(&str) -> Result, GRangesError>, GenomicRangeRecord, - >, + >, } impl Bed5Iterator { @@ -424,7 +467,6 @@ impl Iterator for Bed5Iterator { } } - /// A lazy parser for BED-like files. This lazily parses only the first three columns, and /// yields [`GenomicRangeRecord>`] entries. If the file is a BED3 file, /// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining @@ -436,7 +478,7 @@ pub struct BedlikeIterator { iter: TsvRecordIterator< fn(&str) -> Result>, GRangesError>, GenomicRangeRecord>, - >, + >, } impl BedlikeIterator { @@ -472,13 +514,13 @@ impl BedlikeIterator { result .map(|record| { Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - (), - )) + record.seqname, + record.start, + record.end, + (), + )) }) - .unwrap_or_else(Err) // pass through parsing errors + .unwrap_or_else(Err) // pass through parsing errors }) } } @@ -491,7 +533,7 @@ impl Iterator for BedlikeIterator { } } -/// An iterator over [`IntervalRecord`] items that filters based on sequence name. +/// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. /// /// Note that that the exclude filter is prioritized over the retain filter. 
So, /// if a pipeline contains both, then if a chromosome is supplied to @@ -522,7 +564,7 @@ impl Iterator for BedlikeIterator { #[derive(Debug)] pub struct FilteredRanges where -I: Iterator>, + I: Iterator>, { inner: I, retain_seqnames: Option>, @@ -531,13 +573,13 @@ I: Iterator>, impl FilteredRanges where -I: Iterator>, + I: Iterator>, { pub fn new( inner: I, retain_seqnames: Option<&Vec>, exclude_seqnames: Option<&Vec>, - ) -> Self { + ) -> Self { let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); Self { @@ -551,7 +593,7 @@ I: Iterator>, /// Range-filtering iterator implementation for [`GenomicRangeRecord`]. impl Iterator for FilteredRanges> where -I: Iterator, GRangesError>>, + I: Iterator, GRangesError>>, { type Item = Result, GRangesError>; @@ -562,18 +604,18 @@ I: Iterator, GRangesError>>, Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -585,7 +627,7 @@ I: Iterator, GRangesError>>, /// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. 
impl Iterator for FilteredRanges where -I: Iterator>, + I: Iterator>, { type Item = Result; @@ -596,18 +638,18 @@ I: Iterator>, Ok(entry) => { if self .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } if self .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } } Err(_) => return Some(item), } @@ -620,13 +662,13 @@ impl GeneralRangeRecordIterator>> for BedlikeI fn retain_seqnames( self, seqnames: &[String], - ) -> FilteredRanges>> { + ) -> FilteredRanges>> { FilteredRanges::new(self, Some(&seqnames.to_vec()), None) } fn exclude_seqnames( self, seqnames: &[String], - ) -> FilteredRanges>> { + ) -> FilteredRanges>> { FilteredRanges::new(self, None, Some(&seqnames.to_vec())) } } @@ -638,39 +680,40 @@ impl GeneralRangeRecordIterator for Bed3Iterator { fn exclude_seqnames( self, seqnames: &[String], - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, None, Some(&seqnames.to_vec())) } } impl GeneralRangeRecordIterator> for Bed5Iterator { - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { FilteredRanges::new(self, Some(&seqnames.to_vec()), None) } fn exclude_seqnames( self, seqnames: &[String], - ) -> FilteredRanges> { + ) -> FilteredRanges> { FilteredRanges::new(self, None, Some(&seqnames.to_vec())) } } - - impl GeneralRangeRecordIterator> for UnwrappedRanges where -I: Iterator>, GRangesError>>, + I: Iterator>, GRangesError>>, { fn retain_seqnames( self, seqnames: &[String], - ) -> FilteredRanges> { + ) -> FilteredRanges> { FilteredRanges::new(self, Some(&seqnames.to_vec()), None) } fn exclude_seqnames( self, seqnames: &[String], - ) -> FilteredRanges> { + ) -> FilteredRanges> { 
FilteredRanges::new(self, None, Some(&seqnames.to_vec())) } } @@ -678,14 +721,14 @@ I: Iterator>, GRangesError>>, #[derive(Debug)] pub struct UnwrappedRanges where -I: Iterator>, GRangesError>>, + I: Iterator>, GRangesError>>, { inner: I, } impl UnwrappedRanges where -I: Iterator>, GRangesError>>, + I: Iterator>, GRangesError>>, { pub fn new(inner: I) -> Self { Self { inner } @@ -694,7 +737,7 @@ I: Iterator>, GRangesError>>, impl Iterator for UnwrappedRanges where -I: Iterator>, GRangesError>>, + I: Iterator>, GRangesError>>, { type Item = Result, GRangesError>; @@ -702,11 +745,11 @@ I: Iterator>, GRangesError>>, if let Some(result) = self.inner.next() { return Some(result.and_then(|record| match record.data { Some(data) => Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )), + record.seqname, + record.start, + record.end, + data, + )), None => Err(GRangesError::TryUnwrapDataError), })); } @@ -739,7 +782,7 @@ impl GenomicRangeRecordUnwrappable for BedlikeIterator { /// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. pub fn parse_column(column: &str, line: &str) -> Result where -::Err: std::fmt::Debug, + ::Err: std::fmt::Debug, { // NOTE: this is used a lot, and should be benchmarked. 
column @@ -825,7 +868,6 @@ pub fn parse_bed3(line: &str) -> Result { }) } - fn parse_strand(symbol: char) -> Result, GRangesError> { match symbol { '+' => Ok(Some(Strand::Forward)), @@ -835,9 +877,8 @@ fn parse_strand(symbol: char) -> Result, GRangesError> { } } - /// Parses a BED5 format line into the three columns defining the range, and additional -/// columns +/// columns /// pub fn parse_bed5(line: &str) -> Result, GRangesError> { let columns: Vec<&str> = line.splitn(4, '\t').collect(); @@ -862,13 +903,11 @@ pub fn parse_bed5(line: &str) -> Result, GRange }) } - - #[cfg(test)] mod tests { use crate::io::{ parsers::{get_base_extension, valid_bedlike}, - InputFile, + InputStream, }; use super::GenomicRangesFile; @@ -891,27 +930,27 @@ mod tests { fn test_rangefiletype_detect() { let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bed3(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bed3(_) + )); let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bedlike(_) - )); + range_filetype.unwrap(), + GenomicRangesFile::Bedlike(_) + )); } #[test] fn test_valid_bedlike() { assert_eq!( - valid_bedlike(&mut InputFile::new("tests_data/example.bed")).unwrap(), + valid_bedlike(&mut InputStream::new("tests_data/example.bed")).unwrap(), true - ); + ); assert_eq!( - valid_bedlike(&mut InputFile::new("tests_data/invalid_format.bed")).unwrap(), + valid_bedlike(&mut InputStream::new("tests_data/invalid_format.bed")).unwrap(), false - ); + ); } //#[test] diff --git a/src/iterators.rs b/src/iterators.rs index cb43a48..8e89373 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,3 +1,8 @@ +//! Iterators over genomic ranges and their data. +//! +//! Note that *parsing iterators* are in [`parsers`]. +//! +//! 
[`parsers`]: crate::io::parsers use genomap::GenomeMap; use crate::{ @@ -5,7 +10,7 @@ use crate::{ traits::{GenericRange, IterableRangeContainer, RangeContainer}, }; -/// An iterator yielding [`RangeIndexedRecord`], which store +/// An iterator yielding [`GenomicRangeIndexedRecord`], which store /// indices to the sequence names and data container. /// /// # Developer Notes diff --git a/src/join.rs b/src/join.rs index ed845c4..90260cd 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,3 +1,5 @@ +//! [`LeftGroupedJoin`], [`JoinData`], and [`JoinDataIterator`] types for overlaps. +//! #![allow(clippy::all)] use crate::{traits::GenericRange, Position}; @@ -45,7 +47,10 @@ impl LeftGroupedJoin { // right_data, } } - /// Add a right range to this [`LeftGroupedJoin`]. + /// Add a right (overlapping) range to this [`LeftGroupedJoin`]. + /// + // Note: in principle, this can be called on *non-overlapping* right ranges too, + // for a full-outer join. pub fn add_right(&mut self, left: &R, right: &Q) { self.rights.push(right.index()); self.right_lengths.push(right.width()); @@ -107,6 +112,11 @@ impl<'a, DL, DR> JoinData<'a, DL, DR> { } } +/// An iterator over the [`LeftGroupedJoin`] types that represent +/// information about overlaps right ranges have with a particular left range. +/// +/// This also contains references to the left and right data containers, for +/// better ergonomics in downstream data processing. pub struct JoinDataIterator<'a, DL, DR> { inner: std::slice::Iter<'a, LeftGroupedJoin>, pub left_data: Option<&'a DL>, diff --git a/src/lib.rs b/src/lib.rs index 82bd346..2d84af8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,102 @@ #![crate_name = "granges"] #![doc(html_root_url = "https://docs.rs/granges/")] +//! # GRanges: Generic Genomic Range and Data Containers +//! +//! GRanges is a Rust library for working with genomic ranges and their associated data. GRanges +//! also provides a separate command line tool built using this library, which currently +//! 
implements a subset of the features of other great bioinformatics utilities like
+//! [bedtools](https://bedtools.readthedocs.io/en/latest/).
+//!
+//! The idea of GRanges is to facilitate building a new generation of robust, reproducible, and
+//! easily installable bioinformatics tools in Rust. It aims to lower the barrier to building
+//! simple bioinformatics data processing tools in Rust for biologists that are interested
+//! in using the language in their work. Even biologists that have no interest in learning Rust
+//! should benefit from Rust-based tools since they are fast. Currently GRanges is engineered
+//! as a set of compile-time generic types and methods, though future versions may implement
+//! analogous runtime dynamic data structures that would allow the library to be wrapped and used
+//! by other languages.
+//!
+//!
+//! ## Design Overview
+//!
+//! To understand how GRanges could be useful in processing genomic data, it's helpful to first
+//! understand the overall design of this software library.
+//!
+//! Nearly all range data is loaded into GRanges using **parsing iterators** ([parsers]).
+//! Parsing iterators read input data from some bioinformatics file format (e.g. `.bed`,
+//! `.gtf.gz`, etc) and the data then goes down one of two paths:
+//!
+//! 1. Streaming mode: the entire data set is never read into memory all at once, but rather
+//! is processed entry by entry.
+//!
+//! 2. In-memory mode: the entire data set *is* read into memory all at once, usually as
+//! a [`GRanges`] object. The [`GRanges`] object is composed of two parts:
+//! a *ranges container* ([ranges]) and a *data container* ([data]).
+//! [`GRanges`] objects then provide high-level in-memory methods for working with this
+//! genomic data.
+//!
+//! Both modes then write something to output: a TSV of some statistic calculated on the
+//! genomic data, the output of 10 million block bootstraps, or the coefficients of
+//! 
linear regressions run every kilobase on sequence data. +//! +//! The GRanges workflow idea is illustrated in this (super cool) ASCII diagram: +//! +//! +//! ```text +//! +-------+ +-------------------+ +----------------------+ +//! | Input | -----> | Iterator | -----> | GRanges object | +//! +------ + | (streaming mode) | | (in-memory mode) | +//! +-------------------+ +----------------------+ +//! | | +//! | | +//! v v +//! +-----------------+ +-----------------+ +//! | Data Processing | | Data Processing | +//! +-----------------+ +-----------------+ +//! | | +//! v v +//! +--------+ +--------+ +//! | Output | | Output | +//! +--------+ +--------+ +//! +//! ``` +//! +//! Nearly all common data processing and analytics tasks in science can be thought of as taking +//! some raw input data and refining it down some pipeline, until eventually outputting it as some sort of statistical +//! result. Bioinformatics has been built on composing workflows out of command line tools and +//! pipes. Analogously, tidyverse unifies data tidying, summarizing, and modeling by passing data +//! in a datafame down a similar pipeline-based interface. Both operations are simple to use +//! because they compose the same essential *operations* into specific data processing workflows. +//! +//! GRanges is designed to emulate these powerful data analytics tools. It implements core +//! operations on genomic ranges and their associated data, and provides generic, extensible +//! data structures for storing genomic data. +//! +//! GRanges attempts to implement the same core set of data joining and processing tools as the +//! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and +//! R's [tidyverse](https://www.tidyverse.org). It also frames these genomic operations in +//! standard data analytic and database terms. For example, the operation `bedtools intersect -wa -u -a +//! 
-b ` finds all ranges in the *left* set of genomic ranges that overlap one or more
+//! *right* genomic ranges, and reports only the unique left range. This operation is common in
+//! many data analytics workflows and database queries, and is known as a *semi join* between
+//! a left table and right table. (see for example this section in Hadley Wickham's book [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins)).
+//! In the `granges` command line tool, this is implemented in the `granges filter` subcommand. In
+//! the library, this subcommand is just the method [`GRanges::filter_overlaps()`].
+//!
+//!
+//! ## Documentation Guide
+//!
+//! - ⚙️ : technical details that might only be useful to developers handling tricky cases.
+//! - ⚠️: indicates a method or type that is unstable (it may change in future implementations),
+//! or is "sharp" (developer error could cause a panic).
+//! - 🚀: optimization tip.
+//!
+//! [`GRanges`]: crate::granges::GRanges
+//! [`GRanges::filter_overlaps()`]: crate::granges::GRanges::filter_overlaps
+//! [`GRanges`]: crate::granges::GRanges
+//! [parsers]: crate::io::parsers
+
 pub use indexmap;
 
 pub mod data;
@@ -19,9 +115,14 @@ pub mod traits;
 pub mod commands;
 pub mod reporting;
 
+/// The main position type in GRanges.
 pub type Position = u32;
-pub type PositionOffset = i32; // signed variant
+/// The main *signed* position type in GRanges, to represent offsets (e.g.
+/// for adjusting range coordinates, etc).
+pub type PositionOffset = i32;
+
+/// The main exports of the GRanges library.
 pub mod prelude {
     pub use crate::error::GRangesError;
     pub use crate::granges::{GRanges, GRangesEmpty};
@@ -45,7 +146,9 @@
 An internal error has occurred. Please file a GitHub issue at https://github.com/vsbuffalo/granges/issues
 "#;
 
-/// Create and [`indexmap::IndexMap`] of sequence names and their lengths.
+/// Create an [`IndexMap`] of sequence names and their lengths.
+///
+/// [`IndexMap`]: indexmap::IndexMap
 #[macro_export]
 macro_rules! 
seqlens { ($($key:expr => $value:expr),* $(,)?) => { @@ -53,7 +156,10 @@ macro_rules! seqlens { }; } -/// Create a new `GRanges` with sequence length information (used primarily for small examples) +/// Create a new [`GRanges`] with sequence length information (used primarily for small examples) +/// +/// [`GRanges`]: crate::granges::GRanges +/// #[macro_export] macro_rules! create_granges_with_seqlens { ($range_type:ty, $data_type:ty, { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),*]),* }, seqlens: { $($chr_len:expr => $len:expr),* }) => { @@ -74,7 +180,7 @@ macro_rules! create_granges_with_seqlens { }; } -/// a version of assert_eq! that is more user-friendly. +/// a version of [assert_eq!] that is more user-friendly. #[macro_export] macro_rules! ensure_eq { ($left:expr, $right:expr $(,)?) => ({ diff --git a/src/main/mod.rs b/src/main/mod.rs index 6ee821d..e33d9d0 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -227,9 +227,9 @@ fn run() -> Result<(), GRangesError> { skip_missing, in_mem, }) => { - + unimplemented!() } - + #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { genome, diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index f355d53..b5db7be 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,3 +1,7 @@ +//! The [`COITrees`] type. +//! +//! This wraps the functionality og the [`coitrees`] library by Daniel C. Jones. +//! use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; use crate::{ @@ -59,12 +63,12 @@ impl COITrees { self.ranges.query_count(first, end - 1) } - /// Return the number of ranges in this [`COITreeRangeContainer`] container. + /// Return the number of ranges in this [`COITrees`] container. pub fn len(&self) -> usize { self.ranges.len() } - /// Return whether the [`COITreeRangeContainer`] object is empty (contains no ranges). + /// Return whether the [`COITrees`] object is empty (contains no ranges). 
pub fn is_empty(&self) -> bool { self.len() == 0 } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 42a11a0..45505dd 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -1,4 +1,4 @@ -//! Range and Range Containers. +//! Range types and Range Containers. //! //! diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index fd746ec..7dbfa6e 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -1,11 +1,9 @@ -//! Range Operations. +//! Range operations. //! -//! - [`adjust()`]: Adjust range start and end positions. - use crate::{traits::AdjustableGenericRange, Position, PositionOffset}; -/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] -/// and returning `None` if the range has zero width after adjustment. +/// Adjusts the start and end coordinates of a range, ensuring the adjusted range is +/// within [0, length] and returning `None` if the range has zero width after adjustment. pub fn adjust_range( mut range: R, start_delta: PositionOffset, diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 7213b0f..34fb825 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,3 +1,5 @@ +//! The [`VecRanges`] type, and the [`VecRangesIndexed`] and [`VecRangesEmpty`] type aliases. +//! use super::operations::adjust_range; use super::{validate_range, RangeEmpty, RangeIndexed}; use crate::traits::{ diff --git a/src/reporting.rs b/src/reporting.rs index 6f2ef02..3fd7353 100644 --- a/src/reporting.rs +++ b/src/reporting.rs @@ -1,6 +1,14 @@ -//! Centralized reporting to the user about e.g. fragile operations. +//! Types for standardized reports to the user about range operations. +//! +//! ⚠️: This is still under development. +//! +//! The goal of this is to encourage and facilitate developers of bioinformatics +//! tools to report information about potentially fragile operations, or inform +//! them of e.g. how many ranges were filtered out by some operation. //! 
+/// The [`CommandOutput`] type output is generic over some data output +/// from a command, and a [`Report`] that reports information to the user. #[allow(unused)] pub struct CommandOutput { value: U, @@ -13,6 +21,7 @@ impl CommandOutput { } } +/// A type to (semi) standardize reporting to the user. #[derive(Default)] pub struct Report { entries: Vec, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 232e14a..ac48ae3 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -84,7 +84,7 @@ pub fn random_vecranges(n: usize) -> VecRanges { vr } -/// Build a random [`GRanges, diff --git a/src/traits.rs b/src/traits.rs index 812bbf4..1966830 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -18,6 +18,8 @@ use crate::{ /// The [`AsGRangesRef`] trait improves the ergonomics of working /// with both [`GRanges`] and [`GRangesEmpty`] function arguments. +/// +/// [`GRangesEmpty`]: crate::granges::GRanges pub trait AsGRangesRef<'a, C, T> { fn as_granges_ref(&'a self) -> &'a GRanges; } @@ -95,6 +97,9 @@ pub trait AdjustableGenericRange: GenericRange { /// Defines functionality common to all range containers, e.g. [`VecRanges`] and /// [`COITrees`]. +/// +/// [`VecRanges`]: crate::ranges::vec::VecRanges +/// [`COITrees`]: crate::ranges::coitrees::COITrees pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool { @@ -103,10 +108,13 @@ pub trait RangeContainer { fn sequence_length(&self) -> Position; } +/// Marker trait for data container. +/// This is presently not used for much, but is useful to indicate certain +/// type combinations are data containers. pub trait DataContainer {} -/// Defines functionality for filtering [`RangeRecord`] entries based on their -/// sequence names. [`RangeRecord`] are predominantly used in reading in data, +/// Defines functionality for filtering [`GenomicRangeRecord`] entries based on their +/// sequence names. 
[`GenomicRangeRecord`] are predominantly used in reading in data, /// so these trait methods simplify excluding or retaining ranges based on what /// sequence (i.e. chromosome) they are on. pub trait GeneralRangeRecordIterator: @@ -141,7 +149,7 @@ where fn iter_ranges(&self) -> Box + '_>; } -/// The [`IntoIterableRangeContainer`] trait defines common functionality for *consuming* iterating +/// The [`IntoIterableRangesContainer`] trait defines common functionality for *consuming* iterating /// over the range types in range containers. pub trait IntoIterableRangesContainer { fn into_iter_ranges(self) -> Box>;