Skip to content

Commit

Permalink
Merge pull request #3 from vsbuffalo/new-io
Browse files Browse the repository at this point in the history
New csv + serde parsing, with cleaner type inference; new `TsvRecordI…
  • Loading branch information
vsbuffalo committed Feb 27, 2024
2 parents 181046b + 81226a9 commit f144ab4
Show file tree
Hide file tree
Showing 18 changed files with 613 additions and 593 deletions.
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ bytes = "1.5.0"
ndarray-npy = { version = "0.8.1", optional = true }
num-traits = "0.2.18"
lazy_static = "1.4.0"
medians = "3.0.6"
csv = "1.3.0"
serde = { version = "1.0.197", features = ["derive"] }

[features]
# cli = [ "clap" ] # TODO make feature
Expand All @@ -39,6 +40,7 @@ big-position = []

[profile.release]
opt-level = 3
lto = true

[profile.dev]
opt-level = 1
Expand All @@ -54,3 +56,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] }
name = "bedtools_comparison"
harness = false

[[bench]]
name = "io"
harness = false

77 changes: 36 additions & 41 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ let results_gr = left_gr
// Find overlaps
.left_overlaps(&right_gr)?
// Summarize overlap data
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;
```

where `mean_score()` is:
Expand Down Expand Up @@ -84,54 +84,49 @@ server, here are two runs with 100,000 ranges per operation, and n = 100 samples
# run 1
command bedtools time granges time granges speedup (%)
------------ --------------- -------------- ---------------------
map_multiple 293.41 s 129.99 s 55.6963
map_max 131.97 s 111.39 s 15.5938
adjust 127.36 s 59.25 s 53.4811
filter 113.70 s 101.01 s 11.1607
map_min 116.84 s 108.16 s 7.42583
flank 150.95 s 86.25 s 42.8602
map_mean 116.50 s 143.70 s -23.3524
map_sum 110.60 s 111.39 s -0.71377
windows 476.12 s 67.82 s 85.7555
map_median 154.51 s 104.37 s 32.4551
map_multiple 270.21 s 112.66 s 58.3073
map_max 105.46 s 84.03 s 20.3185
adjust 112.42 s 53.48 s 52.4269
filter 114.23 s 77.96 s 31.7512
map_min 116.22 s 78.93 s 32.0839
flank 162.77 s 80.67 s 50.4383
map_mean 107.05 s 81.89 s 23.5073
map_sum 108.43 s 93.35 s 13.9083
windows 408.03 s 72.58 s 82.2121
map_median 108.57 s 87.32 s 19.5731
# run 2
command bedtools time granges time granges speedup (%)
------------ --------------- -------------- ---------------------
map_multiple 348.14 s 124.29 s 64.2979
map_max 126.27 s 107.05 s 15.2267
adjust 126.51 s 59.23 s 53.1802
filter 114.66 s 97.28 s 15.1542
map_min 118.61 s 113.02 s 4.72037
flank 165.31 s 80.97 s 51.0211
map_mean 133.39 s 108.22 s 18.867
map_sum 120.28 s 128.68 s -6.98736
windows 476.22 s 90.89 s 80.9151
map_median 106.83 s 104.37 s 2.30808
map_multiple 293.24 s 103.66 s 64.6495
map_max 117.84 s 82.39 s 30.0855
adjust 110.09 s 51.63 s 53.0999
filter 120.36 s 67.79 s 43.6784
map_min 114.76 s 86.06 s 25.0081
flank 160.20 s 75.69 s 52.756
map_mean 116.97 s 85.12 s 27.2331
map_sum 114.39 s 85.96 s 24.8557
windows 418.87 s 65.13 s 84.4515
map_median 112.35 s 82.81 s 26.2995
```

The worse performance of `map_mean` in run 1, upon closer inspection, is
largely driven by [aberrant
replicates](https://github.com/vsbuffalo/granges/issues/2). (The server is
under heavy use by others, which adds variance to these benchmarks.) Note too
that unlike `bedtools`, `granges` *always* requires a genome file of chromosome
lengths for validation; this disk I/O adds to GRanges's benchmark times.

Here is another benchmark with smaller test datasets (10,000 ranges) and many
more replicates:
Note, however, that there is an arithmetic scaling issue: larger range datasets
(e.g. 1,000,000 ranges) lead to inefficient map operations. This is almost
surely due to `data/operations.rs`. Here is an example benchmark:

```
command bedtools time granges time granges speedup (%)
------------ --------------- -------------- ---------------------
map_multiple 133.41 s 77.81 s 41.6736
adjust 60.23 s 40.06 s 33.4794
map_min 67.11 s 59.31 s 11.6281
map_mean 64.76 s 59.29 s 8.43342
map_max 67.04 s 59.30 s 11.5468
map_sum 66.89 s 60.01 s 10.2835
map_median 65.63 s 59.47 s 9.37911
flank 84.42 s 57.62 s 31.7415
filter 77.95 s 56.65 s 27.3318
windows 291.41 s 51.47 s 82.3359
map_multiple 26.89 min 688.60 s 57.3189
map_max 21.34 min 21.54 min -0.94889 *
adjust 21.35 min 499.54 s 60.9959
filter 27.01 min 20.38 min 24.5583
map_min 935.69 s 17.48 min -12.1008 *
flank 30.26 min 943.36 s 48.0353
map_mean 20.13 min 931.21 s 22.913
map_sum 17.99 min 18.44 min -2.5225 *
windows 507.84 s 84.79 s 83.304
map_median 16.90 min 17.81 min -5.36811 *
```

So the median, sum, min, and max map operations (the rows marked with `*`) are currently slower than `bedtools` at this scale.
66 changes: 66 additions & 0 deletions benches/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use criterion::{criterion_group, criterion_main, Criterion};
use csv::{self, ReaderBuilder};
use granges::io::parsers::Bed5Addition;
use granges::ranges::GenomicRangeRecord;
use granges::test_utilities::random_bed5file;
use granges::{prelude::*, Position};

/// Row type used to deserialize BED5 records via csv + serde in the
/// benchmark below; field order must match the tab-separated columns
/// of the input file (headerless, so serde maps by position).
#[derive(Debug, serde::Deserialize, PartialEq)]
struct Bed5Row {
    // Sequence (chromosome) name — BED column 1.
    seqname: String,
    // Range start coordinate — BED column 2.
    start: Position,
    // Range end coordinate — BED column 3.
    end: Position,
    // Feature name — BED column 4.
    name: String,
    // Feature score — BED column 5.
    score: f64,
}

// Number of random BED5 records written to the temporary benchmark input file.
const BED_LENGTH: usize = 1_000_000;

/// IO shoot-out: compares GRanges' built-in `Bed5Iterator` parser against a
/// generic `csv` + serde deserialization pipeline, each loading the same
/// random BED5 file into a `GRanges` container.
fn bench_io_shootout(c: &mut Criterion) {
    // Create the benchmark group. Named "io" to match this bench target;
    // the previous name ("adjust") was a copy-paste from another benchmark
    // and would have mislabeled the Criterion reports.
    let mut group = c.benchmark_group("io");

    // Create the test data: a temporary random BED5 file.
    let input_bedfile = random_bed5file(BED_LENGTH);

    // Genome file of sequence lengths, used by GRanges for validation.
    let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap();

    // Use a small sample size: each iteration parses a 1M-record file.
    group.sample_size(10);

    // Bed5Iterator: GRanges' own parser.
    group.bench_function("bed5iterator", |b| {
        b.iter(|| {
            let iter = Bed5Iterator::new(input_bedfile.path()).unwrap();
            let gr = GRanges::from_iter(iter, &genome).unwrap();
            gr.len()
        });
    });

    // CSV: the csv crate with serde-derived deserialization.
    group.bench_function("csv", |b| {
        b.iter(|| {
            let mut rdr = ReaderBuilder::new()
                .delimiter(b'\t')
                .has_headers(false)
                .from_path(input_bedfile.path())
                .unwrap();

            let iter = rdr.deserialize();
            let mut gr: GRanges<VecRangesIndexed, Vec<Bed5Addition>> = GRanges::new_vec(&genome);

            for result in iter {
                let row: GenomicRangeRecord<Bed5Addition> = result.unwrap();
                let data = Bed5Addition {
                    name: row.data.name,
                    score: row.data.score,
                };
                gr.push_range(&row.seqname, row.start, row.end, data)
                    .unwrap();
            }
            // Return the length so both branches yield comparable,
            // non-discardable results (consistent with bed5iterator above).
            gr.len()
        });
    });

    // Finish the group so Criterion emits its summary for this group.
    group.finish();
}

// Register the benchmark group and generate the harness entry point
// (this bench target sets `harness = false` in Cargo.toml).
criterion_group!(benches, bench_io_shootout,);
criterion_main!(benches);
21 changes: 11 additions & 10 deletions examples/mean_score.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
use granges::{prelude::*, join::CombinedJoinDataLeftEmpty};
use granges::{join::CombinedJoinDataLeftEmpty, prelude::*};

// Our overlap data processing function.
pub fn mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 {
// Get the "right data" -- the BED5 scores.
let overlap_scores: Vec<f64> = join_data.right_data.into_iter()
let overlap_scores: Vec<f64> = join_data
.right_data
.into_iter()
// filter out missing values ('.' in BED)
.filter_map(|x| x).collect();
.filter_map(|x| x)
.collect();

// calculate the mean
let score_sum: f64 = overlap_scores.iter().sum();
score_sum / (overlap_scores.len() as f64)
}


fn try_main() -> Result<(), granges::error::GRangesError> {
// Mock sequence lengths (e.g. "genome" file)
let genome = seqlens!("chr1" => 100, "chr2" => 100);
Expand All @@ -30,18 +32,17 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
// Convert to interval trees.
.into_coitrees()?
// Extract out just the score from the additional BED5 columns.
.map_data(|bed5_cols| {
bed5_cols.score
})?;
.map_data(|bed5_cols| bed5_cols.score)?;

// Compute overlaps and combine scores into mean.
let results_gr = left_gr
.left_overlaps(&right_gr)?
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;

results_gr.to_tsv(None::<String>, &BED_TSV)?;
Ok(())
}

fn main() { try_main().unwrap(); }

fn main() {
try_main().unwrap();
}
2 changes: 1 addition & 1 deletion src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ pub fn granges_map(
let left_join_gr = left_gr.left_overlaps(&right_gr)?;

// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
let result_gr = left_join_gr.map_joins(|join_data| {
// Get the "right data" -- the BED5 scores
let mut overlap_scores: Vec<f64> = join_data
.right_data
Expand Down
7 changes: 3 additions & 4 deletions src/data/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ pub fn median<F: Float + Sum>(numbers: &mut [F]) -> Option<F> {
}
let mid = numbers.len() / 2;
if numbers.len() % 2 == 0 {
numbers.select_nth_unstable_by(mid-1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid-1];
numbers.select_nth_unstable_by(mid - 1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid - 1];
numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap());
let upper = numbers[mid];
Some((lower + upper) / F::from(2.0).unwrap())
Expand Down Expand Up @@ -93,7 +93,7 @@ impl Operation {

#[cfg(test)]
mod tests {
use super::*;
use super::*;

#[test]
fn test_median_empty() {
Expand All @@ -118,5 +118,4 @@ mod tests {
let mut numbers = vec![-3.0, -1.0, -2.0];
assert_eq!(median(&mut numbers), Some(-2.0));
}

}
3 changes: 3 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ pub enum GRangesError {
#[error("File reading error: {0}. Please check if the file exists and you have permission to read it.")]
IOError(#[from] std::io::Error),

#[error("File parsing error: {0}")]
TsvParsingError(#[from] csv::Error),

// File parsing related errors
#[error("Could not determine the file type based on its extension. Ensure the file has a standard genomic data extension (.bed, .gff, etc.).")]
CouldNotDetectRangesFiletype,
Expand Down
12 changes: 6 additions & 6 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ where
/// See [`CombinedJoinData`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -847,7 +847,7 @@ impl GRanges<VecRangesIndexed, JoinDataBothEmpty> {
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.

pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -885,7 +885,7 @@ where
/// See [`CombinedJoinDataRightEmpty`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -924,7 +924,7 @@ where
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.

pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -1470,7 +1470,7 @@ mod tests {
}

#[test]
fn test_map_over_joins() {
fn test_map_joins() {
let sl = seqlens!("chr1" => 50);
let windows: GRangesEmpty<VecRangesEmpty> =
GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
Expand All @@ -1487,7 +1487,7 @@ mod tests {
let joined_results = windows
.left_overlaps(&right_gr)
.unwrap()
.map_over_joins(|join_data| {
.map_joins(|join_data| {
let overlap_scores = join_data.right_data;
overlap_scores.iter().sum::<f64>()
})
Expand Down
4 changes: 2 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub mod tsv;

pub use file::{InputStream, OutputStream};
pub use parsers::{
Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
TsvRecordIterator,
bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator,
GenomicRangesFile, GenomicRangesParser,
};
pub use tsv::BED_TSV;
Loading

0 comments on commit f144ab4

Please sign in to comment.