diff --git a/Cargo.toml b/Cargo.toml index d5ab0b6..3ebda50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ bytes = "1.5.0" ndarray-npy = { version = "0.8.1", optional = true } num-traits = "0.2.18" lazy_static = "1.4.0" +medians = "3.0.6" [features] # cli = [ "clap" ] # TODO make feature diff --git a/README.md b/README.md new file mode 100644 index 0000000..2f649bf --- /dev/null +++ b/README.md @@ -0,0 +1,137 @@ +![CI tests](https://github.com/vsbuffalo/granges/workflows/Rust/badge.svg) + +## The GRanges Rust library and command line tool + +GRanges is a Rust library for working with genomic ranges and their associated +data. It aims to make it easy to write extremely performant genomics tools that +work with genomic range data (e.g. BED, GTF/GFF, VCF, etc). Internally, GRanges +uses the *very* fast [coitrees](https://github.com/dcjones/coitrees/) interval +tree library written by Daniel C. Jones for overlap operations. In preliminary +benchmarks, GRanges tools can be 10%-30% faster than similar functionality in +[bedtools2](https://github.com/arq5x/bedtools2) (see benchmark and caveats +below). + +GRanges is inspired by ["tidy"](https://www.tidyverse.org) data analytics +workflows, as well as Bioconductor's +[GenomicRanges](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003118) +and +[plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html). +GRanges uses a similar *method-chaining* pipeline approach to manipulate +genomic ranges, find overlapping genomic regions, and compute statistics. +For example, you could implement your own `bedtools map`-like functionality +in relatively few lines of code: + +```rust +// Create the "right" GRanges object. +let right_gr = bed5_gr + // Convert to interval trees. + .into_coitrees()? + // Extract out just the score from the additional BED5 columns. + .map_data(|bed5_cols| { + bed5_cols.score + })?; + +// Compute overlaps and combine scores into mean. +let results_gr = left_gr + // Find overlaps + .left_overlaps(&right_gr)? + // Summarize overlap data + .map_over_joins(mean_score)?; +``` + +where `mean_score()` is: + +```rust +pub fn mean_score(join_data: CombinedJoinDataLeftEmpty>) -> f64 { + // Get the "right data" out of the join -- the BED5 scores. + let overlap_scores: Vec = join_data.right_data.into_iter() + // filter out missing values ('.' in BED) + .filter_map(|x| x).collect(); + + // Calculate the mean score. + let score_sum: f64 = overlap_scores.iter().sum(); + score_sum / (overlap_scores.len() as f64) +} +``` + +Note that GRanges is a *compile-time* generic Rust library, so the code above +will be heavily optimized by the compiler. Rust uses *zero-cost abstractions*, +meaning high-level code like this is compiled and optimized so that it would be +just as performant as if it were written in a low-level language. + +GRanges is *generic* in the sense that it works with *any* data container type +that stores data associated with genomic data: a `Vec` of some type, an +[ndarray](https://docs.rs/ndarray/latest/ndarray/) `Array2`, +[polars](https://pola.rs) dataframe, etc. GRanges allows the user to write do +common genomics data processing tasks in a few lines of Rust, and then lets the +Rust compiler optimize it. + +As a proof-of-concept, GRanges also provides the command line tool `granges` +built on this library's functionality. This command line tool is intended for +benchmarks against comparable command line tools and for large-scale +integration tests against other software to ensure that GRanges is bug-free. +The `granges` tool currently provides a subset of the features of other great +bioinformatics utilities like +[bedtools](https://bedtools.readthedocs.io/en/latest/). + +## Preliminary Benchmarks + +In an attempt to combat "benchmark hype", this section details the results of +some preliminary benchmarks in an honest and transparent way. On our lab +server, here are two runs with 100,000 ranges per operation, and n = 100 samples: + +``` +# run 1 +command bedtools time granges time granges speedup (%) +------------ --------------- -------------- --------------------- +map_multiple 293.41 s 129.99 s 55.6963 +map_max 131.97 s 111.39 s 15.5938 +adjust 127.36 s 59.25 s 53.4811 +filter 113.70 s 101.01 s 11.1607 +map_min 116.84 s 108.16 s 7.42583 +flank 150.95 s 86.25 s 42.8602 +map_mean 116.50 s 143.70 s -23.3524 +map_sum 110.60 s 111.39 s -0.71377 +windows 476.12 s 67.82 s 85.7555 +map_median 154.51 s 104.37 s 32.4551 + +# run 2 +command bedtools time granges time granges speedup (%) +------------ --------------- -------------- --------------------- +map_multiple 348.14 s 124.29 s 64.2979 +map_max 126.27 s 107.05 s 15.2267 +adjust 126.51 s 59.23 s 53.1802 +filter 114.66 s 97.28 s 15.1542 +map_min 118.61 s 113.02 s 4.72037 +flank 165.31 s 80.97 s 51.0211 +map_mean 133.39 s 108.22 s 18.867 +map_sum 120.28 s 128.68 s -6.98736 +windows 476.22 s 90.89 s 80.9151 +map_median 106.83 s 104.37 s 2.30808 +``` + +The worse performance of `map_mean` in run 1, upon closer inspection, is +largely driven by [aberrant +replicates](https://github.com/vsbuffalo/granges/issues/2). (The server is +under heavy use by others, which adds variance to these benchmarks.) Note too +that unlike `bedtools`, `granges` *always* requires a genome file of chromosome +lengths for validation; this disk I/O adds to GRange's benchmark times. + +Here is another benchmark with smaller test datasets (10,000 ranges) and many +more replicates: + +``` +command bedtools time granges time granges speedup (%) +------------ --------------- -------------- --------------------- +map_multiple 133.41 s 77.81 s 41.6736 +adjust 60.23 s 40.06 s 33.4794 +map_min 67.11 s 59.31 s 11.6281 +map_mean 64.76 s 59.29 s 8.43342 +map_max 67.04 s 59.30 s 11.5468 +map_sum 66.89 s 60.01 s 10.2835 +map_median 65.63 s 59.47 s 9.37911 +flank 84.42 s 57.62 s 31.7415 +filter 77.95 s 56.65 s 27.3318 +windows 291.41 s 51.47 s 82.3359 +``` + diff --git a/examples/mean_score.rs b/examples/mean_score.rs new file mode 100644 index 0000000..f8beb3c --- /dev/null +++ b/examples/mean_score.rs @@ -0,0 +1,47 @@ +use granges::{prelude::*, join::CombinedJoinDataLeftEmpty}; + +// Our overlap data processing function. +pub fn mean_score(join_data: CombinedJoinDataLeftEmpty>) -> f64 { + // Get the "right data" -- the BED5 scores. + let overlap_scores: Vec = join_data.right_data.into_iter() + // filter out missing values ('.' in BED) + .filter_map(|x| x).collect(); + + // calculate the mean + let score_sum: f64 = overlap_scores.iter().sum(); + score_sum / (overlap_scores.len() as f64) +} + + +fn try_main() -> Result<(), granges::error::GRangesError> { + // Mock sequence lengths (e.g. "genome" file) + let genome = seqlens!("chr1" => 100, "chr2" => 100); + + // Create parsing iterators to the left and right BED files. + let left_iter = Bed3Iterator::new("tests_data/bedtools/map_a.txt")?; + let right_iter = Bed5Iterator::new("tests_data/bedtools/map_b.txt")?; + + // Filter out any ranges from chromosomes not in our genome file. + let left_gr = GRangesEmpty::from_iter(left_iter, &genome)?; + let right_gr = GRanges::from_iter(right_iter, &genome)?; + + // Create the "right" GRanges object. + let right_gr = right_gr + // Convert to interval trees. + .into_coitrees()? + // Extract out just the score from the additional BED5 columns. + .map_data(|bed5_cols| { + bed5_cols.score + })?; + + // Compute overlaps and combine scores into mean. + let results_gr = left_gr + .left_overlaps(&right_gr)? + .map_over_joins(mean_score)?; + + results_gr.to_tsv(None::, &BED_TSV)?; + Ok(()) +} + +fn main() { try_main().unwrap(); } + diff --git a/src/commands.rs b/src/commands.rs index 03ad443..be52f44 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -455,7 +455,7 @@ pub fn granges_map( // Process all the overlaps. let result_gr = left_join_gr.map_over_joins(|join_data| { // Get the "right data" -- the BED5 scores - let overlap_scores: Vec = join_data + let mut overlap_scores: Vec = join_data .right_data .into_iter() // Filter out the `None` values. @@ -465,7 +465,7 @@ pub fn granges_map( // Run all operations on the scores. operations .iter() - .map(|operation| operation.run(&overlap_scores)) + .map(|operation| operation.run(&mut overlap_scores)) .collect::>() })?; diff --git a/src/data/operations.rs b/src/data/operations.rs index 26e6d8f..983c43c 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -14,18 +14,19 @@ use super::DatumType; use crate::traits::IntoDatumType; /// Calculate the median. -/// -/// This will clone and turn `numbers` into a `Vec`. -pub fn median(numbers: &[F]) -> Option { +pub fn median(numbers: &mut [F]) -> Option { if numbers.is_empty() { return None; } - let mut numbers = numbers.to_vec(); - numbers.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mid = numbers.len() / 2; if numbers.len() % 2 == 0 { - Some((numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap()) + numbers.select_nth_unstable_by(mid-1, |a, b| a.partial_cmp(b).unwrap()); + let lower = numbers[mid-1]; + numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap()); + let upper = numbers[mid]; + Some((lower + upper) / F::from(2.0).unwrap()) } else { + numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap()); Some(numbers[mid]) } } @@ -42,7 +43,8 @@ pub enum Operation { } impl Operation { - pub fn run(&self, data: &[T]) -> DatumType + #[inline(always)] + pub fn run(&self, data: &mut [T]) -> DatumType where T: Float + Sum + ToPrimitive + Clone + ToString, { @@ -88,3 +90,33 @@ impl Operation { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_median_empty() { + let mut numbers: Vec = vec![]; + assert_eq!(median(&mut numbers), None); + } + + #[test] + fn test_median_odd() { + let mut numbers = vec![2.0, 3.0, 1.0]; + assert_eq!(median(&mut numbers), Some(2.0)); + } + + #[test] + fn test_median_even() { + let mut numbers = vec![1.0, 2.0, 3.0, 4.0]; + assert_eq!(median(&mut numbers), Some(2.5)); + } + + #[test] + fn test_median_negative() { + let mut numbers = vec![-3.0, -1.0, -2.0]; + assert_eq!(median(&mut numbers), Some(-2.0)); + } + +} diff --git a/src/lib.rs b/src/lib.rs index 720730b..cedb842 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -171,7 +171,7 @@ //! let left_join_gr = left_gr.left_overlaps(&right_gr)?; //! //! // Process all the overlaps. -//! let result_gr = left_join_gr.map_over_joins(|join_data| { +//! let results_gr = left_join_gr.map_over_joins(|join_data| { //! // Get the "right data" -- the BED5 scores. //! let overlap_scores: Vec = join_data.right_data.into_iter() //! // filter out missing values ('.' in BED) @@ -185,7 +185,7 @@ //! // Write to a TSV file, using the BED TSV format standards //! // for missing values, etc. //! let tempfile = tempfile::NamedTempFile::new().unwrap(); -//! result_gr.to_tsv(Some(tempfile.path()), &BED_TSV)?; +//! results_gr.to_tsv(Some(tempfile.path()), &BED_TSV)?; //! # Ok(()) //! # } //! # fn main() { try_main().unwrap(); }