From 9d1cf48add93c8a5737a9126b383f850a3242ce8 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sun, 25 Feb 2024 18:28:43 -0800 Subject: [PATCH] Benchmark windows and lots of doc clarity updates --- benches/bedtools_comparison.rs | 48 ++++++- src/granges.rs | 8 +- src/io/parsers.rs | 29 ----- src/lib.rs | 232 ++++++++++++++++++++++++++------- src/sequences/mod.rs | 11 +- src/traits.rs | 14 +- 6 files changed, 244 insertions(+), 98 deletions(-) diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 8d5a677..8f281c5 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -18,7 +18,7 @@ fn bench_range_adjustment(c: &mut Criterion) { let input_bedfile = random_bed3file(BED_LENGTH); // configure the sample size for the group - // group.sample_size(10); + group.sample_size(10); // bedtools slop group.bench_function("bedtools_slop", |b| { @@ -145,10 +145,54 @@ fn bench_flank(c: &mut Criterion) { }); } +fn bench_windows(c: &mut Criterion) { + let width = 100_000; + let step = 1_000; + + // create the benchmark group + let mut group = c.benchmark_group("windows"); + + // configure the sample size for the group + // group.sample_size(10); + group.bench_function("bedtools_makewindows", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("makewindows") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-w") + .arg(width.to_string()) + .arg("-s") + .arg(step.to_string()) + .output() + .expect("bedtools makewindows failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_windows", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + .arg("--step") + .arg(step.to_string()) + .output() + .expect("granges windows failed"); + assert!(granges_output.status.success()); + }); + }); +} + + criterion_group!( benches, bench_filter_adjustment, bench_range_adjustment, bench_flank, -); + bench_windows, + ); criterion_main!(benches); diff --git a/src/granges.rs b/src/granges.rs index 442a24a..25dd1cd 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -632,7 +632,7 @@ where /// [`GRanges`]. /// /// The [`JoinData`] container contains the owned left data container and has - /// a reference to the right data container, as as well as a [`Vec`] /// that contains information about each overlap between a left and zero or more right /// ranges. fn left_overlaps( @@ -674,7 +674,7 @@ where /// [`GRanges`]. /// /// The [`JoinData`] container contains the left data container and has - /// a reference to the right data container, as as well as a [`Vec`] /// that contains information about each overlap between a left and zero or more right /// ranges. fn left_overlaps( @@ -726,7 +726,7 @@ where /// [`GRanges`]. /// /// The [`JoinDataLeftEmpty`] contains no left data, and a reference to the - /// right data container, as as well as a [`Vec`] /// that contains information about each overlap between a left and zero or more right /// ranges. fn left_overlaps( @@ -778,7 +778,7 @@ where /// [`GRanges`]. /// /// The [`JoinDataBothEmpty`] contains no data, since neither left of right - /// [`GRanges`] objects had data. However, it does contain a [`Vec`], /// and each [`LeftGroupedJoin`] contains information about the number of overlapping /// ranges and their lengths. This can be used to summarize, e.g. the number /// of overlapping basepairs, the overlap fraction, etc. diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 117fed8..b16cf48 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -70,35 +70,6 @@ //! iterator types defined in [`GenomicRangesParser`]. The best examples of this are the //! `granges` subcommands implementations. //! -//! ## File Format Terminology -//! -//! - BED3 -//! - BED* -//! - BED-like -//! -//! -//! : it could be ranges-only (i.e. a BED3), or contain data (e.g. BED5). The lazy BED parser will -//! output a [`GenomicRangeRecord>`], where the data would be `None` only in the -//! case that three columns were encountered in the file (which must be a BED3). -//! -//! In GRanges, there are two types of ranges: ranges with an index to an element in the data -//! container, and ranges without indices (i.e. what we would use in the case of processing a BED3 -//! file). Since a [`GRanges`] object needs to have a single, concrete range type in its range -//! containers, it must be known *at compile time* how one should convert the -//! -//! Downstream pipelines must immediately determine how to handle whether there is additional data, -//! or all of the [`GenomicRangeRecord`] entries are `None`, and -//! -//! -//! # BED-like File Parser Design -//! -//! All BED formats (BED3, BED5, etc) are built upon a BED3. Often when working with these types -//! of formats, many operations do not immediately require full parsing of the line, past the -//! *range components*. This is because downstream operations may immediately filter away entries -//! (e.g. based on width), or do an overlap operation, and then filter away entries based on some -//! overlap criteria. Either way, it may be advantageous to work with ranges that just store some -//! generic data. -//! //! [`GRanges`]: crate::granges::GRanges //! [`GRanges`]: crate::granges::GRanges //! [`GRangesEmpty`]: crate::granges::GRangesEmpty diff --git a/src/lib.rs b/src/lib.rs index c4b41c4..64af9f1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,30 +5,128 @@ //! # GRanges: Generic Genomic Range and Data Containers //! //! GRanges is a Rust library for working with genomic ranges and their associated data. GRanges -//! also provides a separate command line tool built using this library, which currently -//! implements a subset of the features of other great bioinformatics utilities like -//! [bedtools](https://bedtools.readthedocs.io/en/latest/). -//! -//! The idea of GRanges is to facilitate building a new generation of robust, reproducible, and -//! easily installable bioinformatics tools in Rust. It aims to lower the barrier to building -//! simple bioinformatics data processing tools in Rust for biologists that are interested -//! in using the language in their work. Even biologists that have no interest in learning Rust -//! should benefit from Rust-based tools since they are fast. Currently GRanges is engineered -//! as a set of compile-time generic types and methods, though future versions may implement -//! analagous runtime dynamic data structures that would allow the library to be wrapped and used -//! by other languages. +//! also implements a [bedtools](https://bedtools.readthedocs.io/en/latest/)-like command line tool +//! as a test and benchmark of the library. In benchmarks, this command line tool is reliably +//! 30-40% faster than bedtools. Internally, GRanges uses the very fast +//! [coitrees](https://github.com/dcjones/coitrees) library written by Daniel C. Jones for overlap +//! operations. +//! +//! The GRanges library aims to simplify the creation of powerful, performant genomics tools in +//! Rust. GRanegs is inspired by the design and ease of use of Bioconductor's +//! [GenomicRanges](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003118) +//! and [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) +//! packages, but tools using GRanges are optimized at compile time and thus are typically much +//! faster. +//! +//! ## GRanges Design +//! +//! Nearly all common data processing and analytics tasks in science can be thought of as taking +//! some raw input data and refining it down some pipeline, until eventually outputting some sort +//! of statistical result. In bioinformatics, genomics data processing workflows are often composed +//! from command line tools and Unix pipes. Analogously, tidyverse unifies data tidying, +//! summarizing, and modeling by passing and manipulating a datafame down a similar pipeline. Unix +//! pipes and tidyverse are powerful because they allow the scientist to build highly specialized +//! analytic tools just by remixing the same core set of data operations. +//! +//! GRanges implements many of the same core data joining and manipulation operations as +//! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and +//! R's [tidyverse](https://www.tidyverse.org). +//! +//! ## The [`GRanges`] container +//! +//! Central to the GRanges library is the generic [`GRanges`] data container. This is generic +//! over the *range container* type `R`, because range data structures have inherent performance +//! tradeoffs for different tasks. For example, when parsing input data into memory, the total size +//! is often not known in advanced. Consequently, the most efficient range container data structure +//! is a dynamically-growing [`Vec`]. However, when overlaps need to be computed between ranges +//! across two containers, the *interval tree* data structures are needed. [`GRanges`] supports +//! conversion between these range container types. +//! +//! [`GRanges`] is also generic over its *data container*, `T`. This allows it to hold *any* +//! type of data, in any data structure: a [`Vec`] of numeric stores, an +//! [`ndarray::Array2`](https://docs.rs/ndarray/latest/ndarray/index.html) matrix, a +//! [polars](https://pola.rs) dataframe, custom `struct`s, etc. Each range in the ranges container +//! indexes an element in the data container. There is also a special [`GRangesEmpty`] type for +//! data-less [`GRanges`] objects. +//! +//! The [`GRanges`] type implements a small but powerful set of methods that can be used to process +//! and analyze genomic data. GRanges relies heavily on *method-chaining* operations that pass +//! genomic range data down a pipeline, similar to tidyverse and Unix pipes. +//! +//! ## The Overlaps--Map-Combine Pattern +//! +//! A very common data analysis strategy is **Split-Apply-Combine** pattern ([Wickham, +//! 2011](https://vita.had.co.nz/papers/plyr.html)). When working with genomic overlaps +//! and their data, GRanges library relies on its own data analysis pattern known +//! as **Overlaps--Map-Combine** pattern, which is better suited for common genomic operations and analyses. +//! +//! ### Overlap Joins and the Left Grouped Join +//! +//! In this pattern, the *left* genomic ranges are "joined" by their *overlaps* with another +//! *right* genomic ranges. In other words, the *right ranges* that overlap a single *left range* can +//! be thought of "joined" to this left range. +//! +//! With genomic data, downstream processing is greatly simplified if the results of this join +//! are *grouped* by what left range they overlap (see illustration below). This is a special +//! kind of join known as a **left grouped join**. +//! +//! +//! ```text +//! Left range: ███████████████████████████████████ +//! +//! Right ranges: ██ ██████ ██████ ██████ +//! ``` +//! +//! In GRanges, a left grouped join ([`GRanges::left_overlaps()`]) returns a [`GRanges`] object +//! containing the exact same ranges as the input *left ranges*. In other words, *left grouped +//! joins* are *endomorphic in the range container*. Computationally, this is also extremely +//! efficient, because the left ranges can be passed through to the results directly (a +//! zero-overheard operation). +//! +//! +//! +//! The [`GRanges`] object returned by [`GRanges::left_overlaps()`] contains a [`JoinData`] (or related) +//! type as its data container. The [`JoinData`] contains information about each left range's overlaps (e.g. number of +//! overlapping bases, fraction of overlaps, etc) in a [`Vec`]. This information about +//! overlapping ranges may then be used by downstream calculations. +//! Additionally [`JoinData`] stores the left ranges' data container, and has a *reference* to +//! the right ranges' data container. In both cases, the data is *unchanged* by the join. +//! Downstream processing can also easily access the left ranges's data and all overlapping right ranges' +//! data for calculations. +//! +//! ### Map-Combine over Joins +//! +//! After a left grouped join, each left range can have zero or more overlapping right ranges. +//! A *map-combine* operation (optionally) maps a function over all right ranges' data and +//! overlaps information, and then combines that into a single data entry per left range. These +//! combined data entries make up a new [`GRanges>`] data container, returned by [`GRanges::map_over_joins()`]. +//! +//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_over_joins()`] is endomorphic +//! over its range container. This means it can be passed through without modification, which +//! is computationally efficient. This results from a Map-Combine operation then can be overlap joined with +//! other genomic ranges, filtered, have its data arbitrarily manipulated by [`GRanges::map_data()`], etc. //! //! ## Example //! -//! The best introduction to the GRanges library is an example illustrating -//! how a common genomic data processing task can be expressed in few lines -//! of (relatively simple) code. Below is an annotated implementation of -//! `granges map`, which provides the same functionality of `bedtools map`: -//! arbitrary operations (e.g. mean, median, max, etc) are applied to the numeric -//! scores of all right ranges that overlap a each left range: +//! To illustrate these ideas, lets look at an example of how we might use GRanges to do a +//! a commonly-encountered genomic calculation. Suppose you had a set of exon genomic ranges (the *left +//! ranges*) and a multicolumn TSV (*right ranges*) of various genomic features (e.g. assay +//! results, recombination hotspots, variants, etc). Imagine you wanted to form some statistic per +//! each exon, based on data in the overlapping right ranges. This operation is very +//! similar to `bedtools map`, except that suppose the statistic you want to calculate is not one of the +//! few offered by bedtools. GRanges makes it simple to compose your own, very fast genomic tools to answer these +//! questions. +//! +//! To see how GRanges would make it simple to compose a specialized fast tool to solve this +//! problem, let's fist see how few lines of code it would take to implement `bedtools map`, +//! since our problem is akin to using `bedtools map` with our own function to calculate our statistic. +//! Let's start by getting the mean score by seeing how to get the mean BED5 score across all +//! overlapping right ranges for each left range (i.e. `bedtools map -a -b -c 5 mean`). +//! Here is the Rust code to do this using GRanges: //! //! ``` -//! use granges::prelude::*; +//! # use granges::prelude::*; +//! # fn try_main() -> Result<(), granges::error::GRangesError> { //! //! // Data for example: //! let genome = seqlens!("chr1" => 100, "chr2" => 100); @@ -39,10 +137,8 @@ //! let seqnames: Vec = genome.keys().cloned().collect(); //! //! // Create parsing iterators to the left and right BED files. -//! let left_iter = Bed3Iterator::new(left_path) -//! .expect("error inferring filetype"); -//! let right_iter = Bed5Iterator::new(right_path) -//! .expect("error inferring filetype"); +//! let left_iter = Bed3Iterator::new(left_path).expect("error inferring filetype"); +//! let right_iter = Bed5Iterator::new(right_path).expect("error inferring filetype"); //! //! // Filter out any ranges from chromosomes not in our genome file. //! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome) @@ -79,14 +175,17 @@ //! // for missing values, etc. //! let path = Some("map_results.bed.gz"); //! result_gr.to_tsv(path, &BED_TSV).expect("error writing output"); +//! # Ok(()) +//! # } +//! # fn main() { try_main().unwrap(); } //! ``` //! -//! ## Design Overview +//! In relatively few lines of code, we can implement core functionality of bedtools. +//! As mentioned earlier, GRanges code typically runs 30-40% faster than bedtools. //! -//! To understand how GRanges could be useful in processing genomic data, it's helpful to first -//! understand the overall design of this software library. +//! ## Loading Data into GRanges //! -//! Nearly all range data is loaded into GRanges using **parsing iterators** (see the [parsers] +//! Nearly all genomic range data enters GRanges through **parsing iterators** (see the [parsers] //! module). Parsing iterators read input data from some bioinformatics file format (e.g. `.bed`, //! `.gtf.gz`, etc) and it then goes down one of two paths: //! @@ -99,8 +198,12 @@ //! [`GRanges`] objects then provide high-level in-memory methods for working with this //! genomic data. //! -//! Both modes then eventually write something to output: a TSV of some statistic calculated on the -//! genomic data, the output of 10 million block bootstraps, or the coefficients of +//! In the example above, we loaded the items in the parsing iterators directly into +//! [`GRanges`] objects, since we had to do overlap operations (in the future, GRanges will +//! support streaming overlap joins for position-sorted input data). +//! +//! Both processing modes eventually output something, e.g. a TSV of some statistic calculated +//! on the genomic data, the output of 10 million block bootstraps, or the coefficients of //! linear regressions run every kilobase on sequence data. //! //! The GRanges workflow idea is illustrated in this (super cool) ASCII diagram: @@ -125,28 +228,41 @@ //! //! ``` //! -//! Nearly all common data processing and analytics tasks in science can be thought of as taking -//! some raw input data and refining it down some pipeline, until eventually outputting it as some sort of statistical -//! result. Bioinformatics has been built on composing workflows out of command line tools and -//! pipes. Analogously, tidyverse unifies data tidying, summarizing, and modeling by passing data -//! in a datafame down a similar pipeline-based interface. Both operations are simple to use -//! because they compose the same essential *operations* into specific data processing workflows. +//! ## Manipulating GRanges objects //! -//! GRanges is designed to emulate these powerful data analytics tools. It implements core -//! operations on genomic ranges and their associated data, and provides generic, extensible -//! data structures for storing genomic data. +//! 1. *Creation*: [`GRanges::new_vec()`], [`GRanges::from_iter()`], [`GRangesEmpty::from_windows()`]. //! -//! GRanges attempts to implement the same core set of data joining and processing tools as the -//! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and -//! R's [tidyverse](https://www.tidyverse.org). It also frames these genomic operations in -//! standard data analytic and database terms. For example, the operation `bedtools intersect -wa -u -a -//! -b ` finds all ranges in the *left* set of genomic ranges that overlaps one or more -//! *right* genomic ranges, and reports only the unique left range. This operation is common in -//! many data analytics workflows and database queries, and is known as a *semi join* between a -//! a left table and right table. (see for example this section in Hadley Wickham's book [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins)). -//! In the `granges` command line tool, this is implemented in the `granges filter` subcommand. In -//! the library, this subcommand is just the method [`GRanges::filter_overlaps()`]. +//! 2. *Range-modifying* functions: [`GRanges::into_coitrees()`], [`GRanges::adjust_ranges()`], [`GRanges::sort()`], +//! [`GRanges::flanking_ranges()`], [`GRanges::filter_overlaps()`]. +//! +//! 3. *Data-modifying* functions: [`GRanges::left_overlaps()`], [`GRanges::map_over_joins()`], [`GRanges::map_data()`]. +//! +//! 4. *Range and Data modifying* functions: [`GRanges::push_range()`]. +//! +//! Note that not all range and data container types support these operations, e.g. +//! [`GRanges::push_range()`] is not available for ranges stored in an interval tree. +//! However, by design, this is known *at compile time*, due Rust's typing system. Thus +//! there is lower risk of runtime panics, since more potential issues are caught at +//! compile time. +//! +//! ## The (Active) Future +//! +//! When I was first learning Rust as a developer, one thing that struck me about the language +//! is that it feels *minimal but never constraining*. This is quite a contrast compared to +//! languages like C++. As Rust developers know, the minimalness was by design; the +//! cultural norm of slow growth produced a better final product in the end. +//! +//! The GRanges library attempts to follow this design too. Currently, the alpha release is +//! about implementating a subset of essential core functionality to benchmark against +//! alternative software. Since the design and ergonomics of the API are under active development, +//! please, *please*, file a GitHub issue if: +//! +//! 1. You want a particular feature. +//! +//! 2. You don't know how you'd implement a feature yourself with GRanges. //! +//! 3. The ["ergonomics"](https://blog.rust-lang.org/2017/03/02/lang-ergonomics.html) don't +//! feel right. //! //! ## Documentation Guide //! @@ -155,10 +271,26 @@ //! or is "sharp" (developer error could cause a panic). //! - 🚀: optimization tip. //! +//! [`JoinData`]: crate::join::JoinData //! [`GRanges`]: crate::granges::GRanges -//! [`GRanges::filter_overlaps()`]: crate::granges::GRanges +//! [`GRangesEmpty`]: crate::granges::GRangesEmpty +//! [`GRanges::filter_overlaps()`]: granges::GRanges::filter_overlaps //! [`GRanges`]: crate::granges::GRanges //! [parsers]: crate::io::parsers +//! [`ndarray::Array2`]: ndarray::Array2 +//! [`GRanges::left_overlaps()`]: crate::traits::LeftOverlaps::left_overlaps +//! [`GRanges>`]: crate::granges::GRanges +//! [`GRanges::map_over_joins()`]: crate::granges::GRanges::map_over_joins +//! [`GRanges::map_data()`]: crate::granges::GRanges::map_data +//! [`GRanges::new_vec()`]: crate::granges::GRanges::new_vec +//! [`GRanges::into_coitrees()`]: crate::granges::GRanges::into_coitrees +//! [`GRanges::adjust_ranges()`]: crate::granges::GRanges::adjust_ranges +//! [`GRanges::push_range()`]: crate::granges::GRanges::push_range +//! [`GRanges::flanking_ranges()`]: crate::granges::GRanges::flanking_ranges +//! [`GRanges::sort()`]: crate::granges::GRanges::sort +//! [`GRanges::from_iter()`]: crate::granges::GRanges::from_iter +//! [`GRangesEmpty::from_windows()`]: crate::granges::GRangesEmpty::from_windows +// TODO broken links, gr! pub use indexmap; @@ -183,7 +315,7 @@ pub mod reporting; /// This type is currently an unwrapped [`u32`]. This should handle /// chromosome lengths for nearly all species. In fact, the only exception /// known so far is lungfush (*Neoceratodus forsteri*), which has a chromosomes -/// that reaches 5.4Gb (https://www.nature.com/articles/s41586-021-03198-8l). +/// that reaches 5.4Gb (). /// The [`u32::MAX`] is 4,294,967,295, i.e. 4.29 Gigabases, which means [`u32`] is /// just barely suitable for even the largest known chromosome. There is a /// performance and memory-efficiency tradeoff when using [`u64`] over [`u32`], diff --git a/src/sequences/mod.rs b/src/sequences/mod.rs index 6bfc124..3cceb40 100644 --- a/src/sequences/mod.rs +++ b/src/sequences/mod.rs @@ -10,18 +10,17 @@ //! ## Main Functionality //! //! - Lazy-loading by sequence (e.g. chromosome) with the [`LazyLoader`] type. +//! //! - If the `--features=ndarray` is set, one and two dimensional per-basepair -//! numeric arrays/matrices *in-memory*, with [`NumericSequences1`] and -//! [`NumericSequences2`], or *lazy loaded* with [`LazyNumericSequences2`] -//! ([``LazyNumericSequences1`] will be added soon, please file a [GitHub -//! issue](https://github.com/vsbuffalo/granges/issues) if you need it). +//! numeric arrays/matrices *in-memory*, with `NumericSequences1` and +//! `NumericSequences2`, or *lazy loaded* with `LazyNumericSequences2` +//! (`LazyNumericSequences1` will be added soon, please file a [GitHub +//! issue](https://github.com]/vsbuffalo/granges/issues) if you need it). //! //! //! [`Sequences`]: crate::traits::Sequences //! [`LazyLoader`]: crate::sequences::lazy //! -//! [`NumericSequences1`]: crate::sequences::numeric::NumericSequences1 -//! [`NumericSequences2`]: crate::sequences::numeric::NumericSequences2 pub mod lazy; pub mod nucleotide; diff --git a/src/traits.rs b/src/traits.rs index 2d71aba..e76c5ec 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -89,11 +89,12 @@ pub trait GenericRange: Clone { /// The [`JoinDataOperations`] trait unifies common operations /// over combined join data types ([`CombinedJoinData`], -/// CombinedJoinDataBothEmpty`], etc). +/// [`CombinedJoinDataBothEmpty`], etc). /// /// -/// [`CombinedJoinData`] crate::granges::join::CombinedJoinData -/// [`CombinedJoinDataBothEmpty`] crate::granges::join::CombinedJoinDataBothEmpty +/// [`JoinDataOperations`]: crate::traits::JoinDataOperations +/// [`CombinedJoinData`]: crate::join::CombinedJoinData +/// [`CombinedJoinDataBothEmpty`]: crate::join::CombinedJoinDataBothEmpty pub trait JoinDataOperations { type LeftDataElementType; type RightDataElementType; @@ -209,10 +210,9 @@ pub trait IntoIterableRangesContainer { /// /// GRanges provides [`IndexedDataContainer`] trait implementations for: /// -/// - [`ndarray::Array1`] -/// - [`ndarray::Array2`] -/// - [`Vec`] -/// - [`polars::DataFrame`] +/// - [`Vec`], this is the most common and general data container. +/// - [ndarray](https://docs.rs/ndarray/latest/ndarray/index.html) `Array1` and `Array2`, if `--features=ndarray` is set. +/// - [polars](https://pola.rs) datafames. /// /// # Panics /// This will panic if `DataContainer.get_value()` tries to access