Skip to content

Commit

Permalink
Sequences::region_map_into_granges(), GRanges::map_into_array*()
Browse files Browse the repository at this point in the history
…added, etc.

 - Renamed `region_apply` in `region_map` to be more consistent with Rust's naming.
 - `retain_seqnames`, etc for `TsvRecordIterator`.
 - New `Sequences::region_map_into_granges()`, which applies a function to
   regions of a sequence, and puts the results in a new `GRanges`.
 - New `Sequences` test cases corresponding to `granges_test_case_01()`,
   and lots of new `Sequence`-related tests.
 - Added `map_into_array1()` and `map_into_array2()` for the
   --feature=ndarray, with tests.
 - `take_ranges()` and `take_both()` added.
 - New `GRanges::midpoints()` method, with tests.
 - New `GRanges::data_indices()` method, to build a `GenomeMap` out of
   the data indices.
 - New `GRanges::data_by_seqnames()` which builds a `GenomeMap` of
   `Vec<U>` of the data, by sequence name. `GRanges::data_refs_by_seqnames()`
   also added, for references.
 - `GRanges::into_vecranegs()` added, with `From` method
   for `COITreesEmpty` to `VecRangesEmpty` added since it's needed.
 - Cleaned lifetime out of `IndexedDataContainer`, brought down
   to assoc. type level. Cleans up things a lot!
 - `into_array1()` and `into_array2()` and tests.
 - Remove section on readme on slow benchmarks -- this is now
   GH issue #4.
  • Loading branch information
vsbuffalo committed Feb 28, 2024
1 parent f144ab4 commit de235dc
Show file tree
Hide file tree
Showing 26 changed files with 785 additions and 202 deletions.
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "granges"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
license = "MIT"
authors = ["Vince Buffalo <[email protected]>"]
Expand Down Expand Up @@ -31,9 +31,8 @@ csv = "1.3.0"
serde = { version = "1.0.197", features = ["derive"] }

[features]
# cli = [ "clap" ] # TODO make feature
dev-commands = [ ]
test_bigranges = [] # NOTE: not used yet, for tests on large files
bench-big = []
polars = ["dep:polars"]
ndarray = ["dep:ndarray", "dep:ndarray-npy"]
big-position = []
Expand Down
20 changes: 0 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,3 @@ windows 418.87 s 65.13 s 84.4515
map_median 112.35 s 82.81 s 26.2995
```

Note however, there is a arithmetic scaling issue. Larger range datasets
(1,000,000) lead to map operations that are inefficient. This is almost surely
due to `data/operations.rs`. Here is an example benchmark:

```
command bedtools time granges time granges speedup (%)
------------ --------------- -------------- ---------------------
map_multiple 26.89 min 688.60 s 57.3189
map_max 21.34 min 21.54 min -0.94889 *
adjust 21.35 min 499.54 s 60.9959
filter 27.01 min 20.38 min 24.5583
map_min 935.69 s 17.48 min -12.1008 *
flank 30.26 min 943.36 s 48.0353
map_mean 20.13 min 931.21 s 22.913
map_sum 17.99 min 18.44 min -2.5225 *
windows 507.84 s 84.79 s 83.304
map_median 16.90 min 17.81 min -5.36811 *
```

So median, sum, min, and max are slower.
5 changes: 4 additions & 1 deletion benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ use std::{
process::{Command, Stdio},
};

#[cfg(not(feature = "bench-big"))]
const BED_LENGTH: usize = 100_000;
#[cfg(feature = "bench-big")]
const BED_LENGTH: usize = 1_000_000;

fn bench_range_adjustment(c: &mut Criterion) {
// create the benchmark group
Expand Down Expand Up @@ -229,7 +232,7 @@ fn bench_map(c: &mut Criterion) {
let mut group = c.benchmark_group(format!("map_{}", operation).to_string());

// configure the sample size for the group
group.sample_size(10);
group.sample_size(100);
group.bench_function("bedtools_map", |b| {
b.iter(|| {
let bedtools_output_file = File::create(&bedtools_path).unwrap();
Expand Down
6 changes: 2 additions & 4 deletions examples/mean_score.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,9 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
.map_data(|bed5_cols| bed5_cols.score)?;

// Compute overlaps and combine scores into mean.
let results_gr = left_gr
.left_overlaps(&right_gr)?
.map_joins(mean_score)?;
let results_gr = left_gr.left_overlaps(&right_gr)?.map_joins(mean_score)?;

results_gr.to_tsv(None::<String>, &BED_TSV)?;
results_gr.write_to_tsv(None::<String>, &BED_TSV)?;
Ok(())
}

Expand Down
31 changes: 18 additions & 13 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,19 +105,22 @@ pub fn granges_adjust(
match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let gr = GRangesEmpty::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
gr.adjust_ranges(-both, both)
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed5(iter) => {
let gr = GRanges::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
gr.adjust_ranges(-both, both)
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bedlike(iter) => {
// Note the call to try_unwrap_data() here: this is because
// we know that the records *do* have data. Unwrapping the Option<String>
// values means that writing to TSV doesn't have to deal with this (which
// always creates headaches).
let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
gr.adjust_ranges(-both, both)
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
Expand Down Expand Up @@ -181,7 +184,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output, &BED_TSV)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -203,7 +206,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output, &BED_TSV)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -223,7 +226,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output, &BED_TSV)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -246,7 +249,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output, &BED_TSV)?;
intersection.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand Down Expand Up @@ -303,7 +306,8 @@ pub fn granges_flank(
} else {
GRangesEmpty::from_iter(iter, &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
gr.flanking_ranges(left, right)?
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
Expand All @@ -314,7 +318,8 @@ pub fn granges_flank(
} else {
GRanges::from_iter(iter.try_unwrap_data(), &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
gr.flanking_ranges(left, right)?
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
Expand Down Expand Up @@ -469,7 +474,7 @@ pub fn granges_map(
.collect::<Vec<DatumType>>()
})?;

result_gr.to_tsv(output, &BED_TSV)?;
result_gr.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -483,7 +488,7 @@ pub fn granges_windows(
output: Option<impl Into<PathBuf>>,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
GRangesEmpty::from_windows(&genome, width, step, chop)?.to_tsv(output, &BED_TSV)?;
GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?;
let report = Report::new();
Ok(CommandOutput::new((), report))
}
Expand All @@ -504,13 +509,13 @@ pub fn granges_random_bed(
if sort {
gr = gr.sort()
}
gr.to_tsv(output, &BED_TSV)?;
gr.write_to_tsv(output, &BED_TSV)?;
} else {
let mut gr = random_granges(&genome, num)?;
if sort {
gr = gr.sort();
}
gr.to_tsv(output, &BED_TSV)?;
gr.write_to_tsv(output, &BED_TSV)?;
};

let report = Report::new();
Expand Down
8 changes: 5 additions & 3 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@

use crate::traits::{DataContainer, IntoDatumType};

#[cfg(feature = "ndarray")]
pub mod ndarray;
pub mod operations;
pub mod vec;

impl<U> DataContainer for Vec<U> {}
impl DataContainer for () {}

/// These are core supported data types stored in an `enum`, to
/// unify the types that come out of standard operations like
/// `select()`.
Expand Down Expand Up @@ -67,6 +72,3 @@ impl<T: IntoDatumType> From<T> for DatumType {
item.into_data_type()
}
}

impl<U> DataContainer for Vec<U> {}
impl DataContainer for () {}
Loading

0 comments on commit de235dc

Please sign in to comment.