Skip to content

Commit

Permalink
Criterion-based benchmarks of granged adjust vs bedtools slop
Browse files Browse the repository at this point in the history
 - Added criterion benchmarks against bedtools
 - New enum pattern for loading BED3 vs BED+
 - clippy/fmt issues resolved
  • Loading branch information
vsbuffalo committed Feb 17, 2024
1 parent 31b84db commit 3e5c09c
Show file tree
Hide file tree
Showing 14 changed files with 170 additions and 84 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ name = "granges"
path = "src/main/mod.rs"

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
tempfile = "3.10.0"


[[bench]]
name = "bedtools_comparison"
harness = false

70 changes: 70 additions & 0 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
//! Benchmark comparisons against bedtools.
//!
//! For comparison accuracy, any benchmarks conducted here must also have
//! corresponding integration tests in `tests/bedtools_validation.rs`, to
//! ensure the output is *exactly* the same.

use criterion::{criterion_group, criterion_main, Criterion};
use granges::{commands::granges_random_bed, test_utilities::granges_binary_path};
use std::process::Command;
use tempfile::NamedTempFile;

/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk.
/// Write a randomly generated BED file to a fresh temporary file and return its handle.
///
/// The ranges are produced by [`granges_random_bed`] from the sequence lengths in
/// `tests_data/hg38_seqlens.tsv`: 100,000 ranges are requested, with the `sort`
/// flag enabled.
///
/// The returned [`NamedTempFile`] must be kept alive for as long as the file is
/// needed, since the backing file is removed when the handle is dropped.
///
/// # Panics
///
/// Panics if the temporary file cannot be created or if generating the random
/// BED file fails.
pub fn random_bedfile() -> NamedTempFile {
    let bedfile = NamedTempFile::new().expect("Failed to create temp file");

    // Hand the generator an owned copy of the path; `bedfile` itself is
    // returned to the caller so the file outlives this function.
    let generated = granges_random_bed(
        "tests_data/hg38_seqlens.tsv",
        100_000,
        Some(bedfile.path().to_path_buf()),
        true,
    );
    generated.expect("could not generate random BED file");

    bedfile
}

/// Benchmark `granges adjust` against `bedtools slop` on the same random BED file.
///
/// Both tools are run as external processes (so process start-up time is part of
/// each measurement) and are asked to extend every range by 10 positions on both
/// sides, using `tests_data/hg38_seqlens.tsv` for the sequence lengths.
///
/// # Panics
///
/// Panics if either command cannot be spawned or exits with a non-zero status.
fn bench_range_adjustment(c: &mut Criterion) {
    // Create the benchmark group.
    let mut group = c.benchmark_group("slop vs adjust");

    // Create the test data once, outside of the timed closures. The handle is
    // held until the end of the function so the file exists for every iteration.
    let input_bedfile = random_bedfile();

    // To shorten local runs, the sample size for the group can be lowered here,
    // e.g. with `group.sample_size(10)`.

    // bedtools slop
    group.bench_function("bedtools_slop", |b| {
        b.iter(|| {
            let bedtools_output = Command::new("bedtools")
                .arg("slop")
                .arg("-g")
                .arg("tests_data/hg38_seqlens.tsv")
                .arg("-b")
                .arg("10")
                .arg("-i")
                .arg(input_bedfile.path())
                .output()
                .expect("bedtools slop failed");
            assert!(bedtools_output.status.success());
        });
    });

    // granges adjust
    // NOTE(review): this invocation also passes `--sort`, so its timing includes
    // sorting the adjusted ranges, while the bedtools command above has no sort
    // step — confirm that this asymmetry is intended for the comparison.
    group.bench_function("granges_adjust", |b| {
        b.iter(|| {
            let granges_output = Command::new(granges_binary_path())
                .arg("adjust")
                .arg("--seqlens")
                .arg("tests_data/hg38_seqlens.tsv")
                .arg("--both")
                .arg("10")
                .arg("--sort")
                .arg(input_bedfile.path())
                .output()
                .expect("granges adjust failed");
            assert!(granges_output.status.success());
        });
    });

    // Explicitly close the group so the summary for it is generated here rather
    // than implicitly when `group` is dropped.
    group.finish();
}
criterion_group!(benches, bench_range_adjustment);
criterion_main!(benches);
26 changes: 20 additions & 6 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
use std::path::PathBuf;

use crate::{prelude::*, PositionOffset, reporting::{CommandOutput, Report}, io::OutputFile, ranges::operations::adjust_range, test_utilities::random_granges};

use crate::{
io::OutputFile,
prelude::*,
ranges::operations::adjust_range,
reporting::{CommandOutput, Report},
test_utilities::random_granges,
PositionOffset,
};

/// Adjust the genomic ranges in a bedfile by some specified amount.
// NOTE: we don't build the full GRanges objects here, for efficiency.
Expand Down Expand Up @@ -57,8 +63,17 @@ pub fn granges_adjust(
// we need to sort, so we build up the appropriate type of GRanges
// object, depending on if we need to hold data or not.
match gr {
GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end)?,
GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end, range_adjusted.data)?,
GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(
&range_adjusted.seqname,
range_adjusted.start,
range_adjusted.end,
)?,
GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(
&range_adjusted.seqname,
range_adjusted.start,
range_adjusted.end,
range_adjusted.data,
)?,
}
}
} else {
Expand All @@ -79,7 +94,6 @@ pub fn granges_adjust(
GRangesVariant::Empty(obj) => obj.sort().to_bed3(output)?,
GRangesVariant::Indexed(obj) => obj.sort().to_tsv(output)?,
}

}
Ok(CommandOutput::new((), report))
}
Expand All @@ -90,7 +104,7 @@ pub fn granges_random_bed(
num: u32,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand Down
5 changes: 0 additions & 5 deletions src/data/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

use crate::traits::IndexedDataContainer;


/// Trait methods for the commonly-used `Vec<U>` data container.
///
/// Note that the associated `Item` type is always a *reference* to the data elements.
Expand All @@ -29,7 +28,3 @@ where
Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
}
}




18 changes: 7 additions & 11 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
RangeEmpty, RangeIndexed, RangeRecord,
},
traits::{RangeContainer, RangesIterable, TsvSerialize, GenericRange, IndexedDataContainer},
traits::{GenericRange, IndexedDataContainer, RangeContainer, RangesIterable, TsvSerialize},
Position,
};

Expand Down Expand Up @@ -71,11 +71,7 @@ impl<R: GenericRange, T> GRanges<VecRanges<R>, T> {

/// Consume this [`GRanges`] object and sort the ranges.
pub fn sort(mut self) -> Self {
self.ranges
.values_mut()
.for_each(|ranges| {
ranges.sort()
});
self.ranges.values_mut().for_each(|ranges| ranges.sort());
self
}
}
Expand Down Expand Up @@ -106,10 +102,11 @@ impl<U> GRanges<VecRangesIndexed, Vec<U>> {
}
}

impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
where T: IndexedDataContainer<'a>,
T: TsvSerialize,
<T as IndexedDataContainer<'a>>::Item: TsvSerialize
impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
where
T: IndexedDataContainer<'a>,
T: TsvSerialize,
<T as IndexedDataContainer<'a>>::Item: TsvSerialize,
{
///
pub fn to_tsv(&'a self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
Expand All @@ -126,7 +123,6 @@ where T: IndexedDataContainer<'a>,
}
Ok(())
}

}

impl GRanges<VecRangesEmpty, ()> {
Expand Down
4 changes: 1 addition & 3 deletions src/io/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ impl InputFile {
}

/// Collects comment lines and/or a line at the start of the file.
pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>)
-> io::Result<bool> {
pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result<bool> {
let mut buf_reader = self.reader()?;
let mut comments = Vec::new();
let mut line = String::new();
Expand Down Expand Up @@ -123,7 +122,6 @@ impl InputFile {
Ok(self.skip_lines > 0)
}


/// Detect the number of columns *from the first line*, according to some delimiter.
/// This is not robust against ragged delimited data formats.
pub fn detect_columns(&mut self, delim: &str) -> Result<usize, GRangesError> {
Expand Down
2 changes: 2 additions & 0 deletions src/join.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![allow(clippy::all)]

use std::rc::Rc;

use crate::Position;
Expand Down
6 changes: 3 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

pub use indexmap;

pub mod data;
pub mod error;
pub mod granges;
pub mod io;
pub mod data;
pub mod iterators;
pub mod join;
pub mod ranges;
Expand All @@ -30,8 +30,8 @@ pub mod prelude {

pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
pub use crate::traits::{
IndexedDataContainer,
GeneralRangeRecordIterator, GenericRange, RangesIntoIterable, RangesIterable, TsvSerialize,
GeneralRangeRecordIterator, GenericRange, IndexedDataContainer, RangesIntoIterable,
RangesIterable, TsvSerialize,
};

pub use crate::seqlens;
Expand Down
6 changes: 2 additions & 4 deletions src/main/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::path::PathBuf;

use clap::{Parser, Subcommand};
use granges::{prelude::GRangesError, PositionOffset, commands::{granges_adjust}};
use granges::{commands::granges_adjust, prelude::GRangesError, PositionOffset};

#[cfg(feature = "dev-commands")]
use granges::commands::granges_random_bed;
Expand Down Expand Up @@ -45,7 +45,6 @@ enum Commands {
/// sort the ranges after adjusting their start and end positions
#[arg(long)]
sort: bool,

},

#[cfg(feature = "dev-commands")]
Expand All @@ -59,7 +58,7 @@ enum Commands {
/// an optional output file (standard output will be used if not specified)
#[arg(long)]
output: Option<PathBuf>,
/// sort the ranges
/// sort the ranges
#[arg(long)]
sort: bool,
},
Expand All @@ -83,7 +82,6 @@ fn run() -> Result<(), GRangesError> {
sort,
}) => granges_random_bed(seqlens, *num, output.as_ref(), *sort),
None => {

println!("{}\n", INFO);
std::process::exit(1);
}
Expand Down
Loading

0 comments on commit 3e5c09c

Please sign in to comment.