Skip to content

Commit

Permalink
granges filter is working, matches bedtools intersect output, and is …
Browse files Browse the repository at this point in the history
…40% faster.

 - Working version of granges filter subcommand, for filtering based
   on overlaps.
 - New integration test of granges filter vs bedtools intersect.
 - New benchmarks: granges filter vs bedtools intersect. We're fast!
   e.g. random one: granges: 6.28ms, bedtools: 8.80ms - about 40% faster.
 - Renamed `to_coitrees()` to `into_coitrees()`.
 - Fixed issue with the `AsGRangesRef` not being used when it should have
   been.
 - Bunch of new `filter_overlaps()` tests.
 - Fixed off-by-one bug with `From<coitrees::Interval>` to our range types.
 - Bunch of new tests for converting between range types to check by
   off-by-one errors.
 - Reshuffled `temp_bedfile()` and `random_bedfile()` into `test_utilities.rs`.
 - Checked in new example BED and BED-like files.
  • Loading branch information
vsbuffalo committed Feb 20, 2024
1 parent 7a014af commit 3935be9
Show file tree
Hide file tree
Showing 18 changed files with 435 additions and 219 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ indexmap = "2.2.3"
ndarray = "0.15.6"
noodles = { version = "0.63.0", features = ["core", "bed"] }
rand = "0.8.5"
tempfile = "3.10.0"
thiserror = "1.0.57"

[features]
Expand All @@ -38,7 +39,6 @@ path = "src/main/mod.rs"

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
tempfile = "3.10.0"

[[bench]]
name = "bedtools_comparison"
Expand Down
69 changes: 50 additions & 19 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,17 @@
//! ensure the output is the *exact* same.

use criterion::{criterion_group, criterion_main, Criterion};
use granges::{commands::granges_random_bed, test_utilities::granges_binary_path};
use granges::test_utilities::{granges_binary_path, random_bedfile};
use std::process::Command;
use tempfile::NamedTempFile;

const BED_LENGTH: u32 = 10_000;

/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk.
pub fn random_bedfile() -> NamedTempFile {
let temp_file = NamedTempFile::new().expect("Failed to create temp file");
let random_bedfile_path = temp_file.path().to_path_buf();
granges_random_bed(
"tests_data/hg38_seqlens.tsv",
BED_LENGTH,
Some(&random_bedfile_path),
true,
)
.expect("could not generate random BED file");
temp_file
}
const BED_LENGTH: usize = 10_000;

fn bench_range_adjustment(c: &mut Criterion) {
// create the benchmark group
let mut group = c.benchmark_group("slop vs adjust");

// create the test data
let input_bedfile = random_bedfile();
let input_bedfile = random_bedfile(BED_LENGTH);

// configure the sample size for the group
// group.sample_size(10);
Expand Down Expand Up @@ -68,5 +53,51 @@ fn bench_range_adjustment(c: &mut Criterion) {
});
});
}
criterion_group!(benches, bench_range_adjustment);

fn bench_filter_adjustment(c: &mut Criterion) {
// create the benchmark group
let mut group = c.benchmark_group("intersect vs filter");

// create the test data
let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH);
let random_bedfile_right_tempfile = random_bedfile(BED_LENGTH);
let random_bedfile_left = random_bedfile_left_tempfile.path();
let random_bedfile_right = random_bedfile_right_tempfile.path();

// configure the sample size for the group
// group.sample_size(10);
group.bench_function("bedtools_intersect", |b| {
b.iter(|| {
let bedtools_output = Command::new("bedtools")
.arg("intersect")
.arg("-a")
.arg(&random_bedfile_left)
.arg("-b")
.arg(&random_bedfile_right)
.arg("-wa")
.arg("-u")
.output()
.expect("bedtools intersect failed");
assert!(bedtools_output.status.success());
});
});

group.bench_function("granges_filter", |b| {
b.iter(|| {
let granges_output = Command::new(granges_binary_path())
.arg("filter")
.arg("--seqlens")
.arg("tests_data/hg38_seqlens.tsv")
.arg("--left")
.arg(&random_bedfile_left)
.arg("--right")
.arg(&random_bedfile_right)
.output()
.expect("granges adjust failed");
assert!(granges_output.status.success());
});
});
}

criterion_group!(benches, bench_filter_adjustment, bench_range_adjustment);
criterion_main!(benches);
65 changes: 41 additions & 24 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::path::PathBuf;

use crate::{
io::{OutputFile, parsers::GenomicRangesParser},
io::{parsers::GenomicRangesParser, OutputFile},
prelude::*,
ranges::operations::adjust_range,
reporting::{CommandOutput, Report},
Expand Down Expand Up @@ -56,34 +56,33 @@ pub fn granges_adjust(

if skipped_ranges > 0 {
report.add_issue(format!(
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
}
}
} else {
// If we do need to sort, build up a GRanges variant and adjust ranges through
// the GRanges interface. Note we need to detect and build a specific iterator
// If we do need to sort, build up a GRanges variant and adjust ranges through
// the GRanges interface. Note we need to detect and build a specific iterator
// for the filetype.

let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;
match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let gr = GRangesEmpty::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
},
}
GenomicRangesParser::Bedlike(iter) => {
// Note the call to try_unwrap_data() here: this is because
// we know that the records *do* have data. Unwrapping the Option<String>
// values means that writing to TSV doesn't have to deal with this (which
// values means that writing to TSV doesn't have to deal with this (which
// always creates headaches).
let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
},
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
},

}
}
}
Ok(CommandOutput::new((), report))
Expand All @@ -102,47 +101,65 @@ pub fn granges_filter(
let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;

let output_stream = output.as_ref().map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
});
let mut writer = output_stream.writer()?;

// for reporting stuff to the user
let mut report = Report::new();

match (left_iter, right_iter) {
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => {
let left_gr = GRangesEmpty::from_iter(left, &genome)?;
let right_gr = GRangesEmpty::from_iter(right, &genome)?;

let left_gr = GRangesEmpty::from_iter(left, &genome)?; let right_gr =
GRangesEmpty::from_iter(right, &genome)?;

let right_gr = right_gr.to_coitrees()?;
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
let mut intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr = GRangesEmpty::from_iter(left, &genome)?;
let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?;

let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?;
let right_gr = GRangesEmpty::from_iter(right, &genome)?;

let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?;
let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?;

let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
_ => Ok(CommandOutput::new((), report)),
_ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
}
}

/// Generate a random BED-like file with genomic ranges.
pub fn granges_random_bed(
seqlens: impl Into<PathBuf>,
num: u32,
num: usize,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand Down
Loading

0 comments on commit 3935be9

Please sign in to comment.