Sequences::region_map_into_granges(), GRanges::map_into_array*() …

…added, etc. - Renamed `region_apply` in `region_map` to be more consistent with Rust's naming. - `retain_seqnames`, etc for `TsvRecordIterator`. - New `Sequences::region_map_into_granges()`, which applies a function to regions of a sequence, and puts the results in a new `GRanges`. - New `Sequences` test cases corresponding to `granges_test_case_01()`, and lots of new `Sequence`-related tests. - Added `map_into_array1()` and `map_into_array2()` for the --feature=ndarray, with tests. - `take_ranges()` and `take_both()` added. - New `GRanges::midpoints()` method, with tests. - New `GRanges::data_indices()` method, to build a `GenomeMap` out of the data indices. - New `GRanges::data_by_seqnames()` which builds a `GenomeMap` of `Vec<U>` of the data, by sequence name. `GRanges::data_refs_by_seqnames()` also added, for references. - `GRanges::into_vecranegs()` added, with `From` method for `COITreesEmpty` to `VecRangesEmpty` added since it's needed. - Cleaned lifetime out of `IndexedDataContainer`, brought down to assoc. type level. Cleans up things a lot! - `into_array1()` and `into_array2()` and tests. - Remove section on readme on slow benchmarks -- this is now GH issue #4.
vsbuffalo · Feb 28, 2024 · de235dc · de235dc
1 parent f144ab4
commit de235dc
Show file tree

Hide file tree

Showing 26 changed files with 785 additions and 202 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "granges"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 license = "MIT"
 authors = ["Vince Buffalo <[email protected]>"]
@@ -31,9 +31,8 @@ csv = "1.3.0"
 serde = { version = "1.0.197", features = ["derive"] }
 
 [features]
-# cli = [ "clap" ] # TODO make feature
 dev-commands = [ ]
-test_bigranges = [] # NOTE: not used yet, for tests on large files
+bench-big = []
 polars = ["dep:polars"]
 ndarray = ["dep:ndarray", "dep:ndarray-npy"]
 big-position = []

diff --git a/README.md b/README.md
@@ -110,23 +110,3 @@ windows 418.87 s 65.13 s 84.4515
 map_median 112.35 s 82.81 s 26.2995
 ```
 
-Note however, there is a arithmetic scaling issue. Larger range datasets
-(1,000,000) lead to map operations that are inefficient. This is almost surely 
-due to `data/operations.rs`. Here is an example benchmark:
-
-```
-command bedtools time granges time granges speedup (%)
------------- --------------- -------------- ---------------------
-map_multiple 26.89 min 688.60 s 57.3189
-map_max 21.34 min 21.54 min -0.94889 *
-adjust 21.35 min 499.54 s 60.9959
-filter 27.01 min 20.38 min 24.5583
-map_min 935.69 s 17.48 min -12.1008 * 
-flank 30.26 min 943.36 s 48.0353
-map_mean 20.13 min 931.21 s 22.913
-map_sum 17.99 min 18.44 min -2.5225 *
-windows 507.84 s 84.79 s 83.304
-map_median 16.90 min 17.81 min -5.36811 *
-```
-
-So median, sum, min, and max are slower.
diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
@@ -13,7 +13,10 @@ use std::{
  process::{Command, Stdio},
 };
 
+#[cfg(not(feature = "bench-big"))]
 const BED_LENGTH: usize = 100_000;
+#[cfg(feature = "bench-big")]
+const BED_LENGTH: usize = 1_000_000;
 
 fn bench_range_adjustment(c: &mut Criterion) {
  // create the benchmark group
@@ -229,7 +232,7 @@ fn bench_map(c: &mut Criterion) {
  let mut group = c.benchmark_group(format!("map_{}", operation).to_string());
 
  // configure the sample size for the group
- group.sample_size(10);
+ group.sample_size(100);
  group.bench_function("bedtools_map", |b| {
  b.iter(|| {
  let bedtools_output_file = File::create(&bedtools_path).unwrap();

diff --git a/examples/mean_score.rs b/examples/mean_score.rs
@@ -35,11 +35,9 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
  .map_data(|bed5_cols| bed5_cols.score)?;
 
  // Compute overlaps and combine scores into mean.
- let results_gr = left_gr
- .left_overlaps(&right_gr)?
- .map_joins(mean_score)?;
+ let results_gr = left_gr.left_overlaps(&right_gr)?.map_joins(mean_score)?;
 
- results_gr.to_tsv(None::<String>, &BED_TSV)?;
+ results_gr.write_to_tsv(None::<String>, &BED_TSV)?;
  Ok(())
 }
 

diff --git a/src/commands.rs b/src/commands.rs
@@ -105,19 +105,22 @@ pub fn granges_adjust(
  match ranges_iter {
  GenomicRangesParser::Bed3(iter) => {
  let gr = GRangesEmpty::from_iter(iter, &genome)?;
- gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
+ gr.adjust_ranges(-both, both)
+ .write_to_tsv(output, &BED_TSV)?
  }
  GenomicRangesParser::Bed5(iter) => {
  let gr = GRanges::from_iter(iter, &genome)?;
- gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
+ gr.adjust_ranges(-both, both)
+ .write_to_tsv(output, &BED_TSV)?
  }
  GenomicRangesParser::Bedlike(iter) => {
  // Note the call to try_unwrap_data() here: this is because
  // we know that the records *do* have data. Unwrapping the Option<String>
  // values means that writing to TSV doesn't have to deal with this (which
  // always creates headaches).
  let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?;
- gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
+ gr.adjust_ranges(-both, both)
+ .write_to_tsv(output, &BED_TSV)?
  }
  GenomicRangesParser::Unsupported => {
  return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
@@ -181,7 +184,7 @@ pub fn granges_filter(
  let right_gr = right_gr.into_coitrees()?;
 
  let semijoin = left_gr.filter_overlaps(&right_gr)?;
- semijoin.to_tsv(output, &BED_TSV)?;
+ semijoin.write_to_tsv(output, &BED_TSV)?;
 
  Ok(CommandOutput::new((), report))
  }
@@ -203,7 +206,7 @@ pub fn granges_filter(
  let right_gr = right_gr.into_coitrees()?;
 
  let semijoin = left_gr.filter_overlaps(&right_gr)?;
- semijoin.to_tsv(output, &BED_TSV)?;
+ semijoin.write_to_tsv(output, &BED_TSV)?;
 
  Ok(CommandOutput::new((), report))
  }
@@ -223,7 +226,7 @@ pub fn granges_filter(
  let right_gr = right_gr.into_coitrees()?;
 
  let semijoin = left_gr.filter_overlaps(&right_gr)?;
- semijoin.to_tsv(output, &BED_TSV)?;
+ semijoin.write_to_tsv(output, &BED_TSV)?;
 
  Ok(CommandOutput::new((), report))
  }
@@ -246,7 +249,7 @@ pub fn granges_filter(
  let right_gr = right_gr.into_coitrees()?;
 
  let intersection = left_gr.filter_overlaps(&right_gr)?;
- intersection.to_tsv(output, &BED_TSV)?;
+ intersection.write_to_tsv(output, &BED_TSV)?;
 
  Ok(CommandOutput::new((), report))
  }
@@ -303,7 +306,8 @@ pub fn granges_flank(
  } else {
  GRangesEmpty::from_iter(iter, &genome)?
  };
- gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
+ gr.flanking_ranges(left, right)?
+ .write_to_tsv(output, &BED_TSV)?
  }
  GenomicRangesParser::Bed5(_iter) => {
  unimplemented!()
@@ -314,7 +318,8 @@ pub fn granges_flank(
  } else {
  GRanges::from_iter(iter.try_unwrap_data(), &genome)?
  };
- gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
+ gr.flanking_ranges(left, right)?
+ .write_to_tsv(output, &BED_TSV)?
  }
  GenomicRangesParser::Unsupported => {
  return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
@@ -469,7 +474,7 @@ pub fn granges_map(
  .collect::<Vec<DatumType>>()
  })?;
 
- result_gr.to_tsv(output, &BED_TSV)?;
+ result_gr.write_to_tsv(output, &BED_TSV)?;
 
  Ok(CommandOutput::new((), report))
 }
@@ -483,7 +488,7 @@ pub fn granges_windows(
  output: Option<impl Into<PathBuf>>,
 ) -> Result<CommandOutput<()>, GRangesError> {
  let genome = read_seqlens(seqlens)?;
- GRangesEmpty::from_windows(&genome, width, step, chop)?.to_tsv(output, &BED_TSV)?;
+ GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?;
  let report = Report::new();
  Ok(CommandOutput::new((), report))
 }
@@ -504,13 +509,13 @@ pub fn granges_random_bed(
  if sort {
  gr = gr.sort()
  }
- gr.to_tsv(output, &BED_TSV)?;
+ gr.write_to_tsv(output, &BED_TSV)?;
  } else {
  let mut gr = random_granges(&genome, num)?;
  if sort {
  gr = gr.sort();
  }
- gr.to_tsv(output, &BED_TSV)?;
+ gr.write_to_tsv(output, &BED_TSV)?;
  };
 
  let report = Report::new();

diff --git a/src/data/mod.rs b/src/data/mod.rs
@@ -3,9 +3,14 @@
 
 use crate::traits::{DataContainer, IntoDatumType};
 
+#[cfg(feature = "ndarray")]
+pub mod ndarray;
 pub mod operations;
 pub mod vec;
 
+impl<U> DataContainer for Vec<U> {}
+impl DataContainer for () {}
+
 /// These are core supported data types stored in an `enum`, to
 /// unify the types that come out of standard operations like
 /// `select()`.
@@ -67,6 +72,3 @@ impl<T: IntoDatumType> From<T> for DatumType {
  item.into_data_type()
  }
 }
-
-impl<U> DataContainer for Vec<U> {}
-impl DataContainer for () {}