New merging iterators, cleaned up BED parsing, new merge command.
 - Added merging iterator prototypes: streaming iterators over sorted
   input whose ranges are merged based on distance or overlap (see the
   sketch below).
 - Much cleaner BED variant detection, and a new `Bed4Iterator`.
 - Entire reorganization of the parsing modules.
 - New detection methods that try serde deserialization.
 - The new `merge` command is structured differently from the other
   commands, as part of a new, cleaner way to build up subcommands
   in code.
 - New validation and benchmarks of `granges merge`.
 - New `granges_test_case_03()` and new test data.
 - New test utilities, e.g. `validate_bedfloats()` for validating
   against `bedtools map` and similar functions.
 - `Bedlike` parsing is now based on csv + serde.
 - New `granges filter-chrom` command.
vsbuffalo committed Mar 4, 2024
1 parent 0863d3f commit ba880b6
Showing 31 changed files with 2,230 additions and 993 deletions.
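
For orientation, the sketch below shows the streaming merge pattern that the new `merge` subcommand (in `src/commands.rs` below) is built on: detect the BED variant, wrap the sorted parsing iterator in a merging iterator, and serialize each merged range as it is produced. This is a minimal sketch, not code from this commit; it assumes the `merging_iterators` and `io::parsers` modules are reachable from the `granges` crate root as they are used inside `src/commands.rs`, and that `GenomicRangesFile`, `PositionOffset`, and `GRangesError` are exported via the prelude.

use std::{io, path::PathBuf};

use csv::WriterBuilder;
use granges::io::parsers::GenomicRangesParser;
use granges::merging_iterators::MergingEmptyResultIterator;
use granges::prelude::*;

/// Stream a sorted BED3 file, merging ranges within `distance` of each
/// other, and write each merged range to standard output as TSV.
/// (Sketch only; module paths and prelude contents are assumptions.)
fn merge_bed3(bedfile: &PathBuf, distance: PositionOffset) -> Result<(), GRangesError> {
    let mut writer = WriterBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_writer(io::stdout());

    match GenomicRangesFile::parsing_iterator(bedfile)? {
        GenomicRangesParser::Bed3(iter) => {
            // Lazily merge the sorted input; no GRanges object is materialized.
            let merging_iter = MergingEmptyResultIterator::new(iter, distance);
            for result in merging_iter {
                writer.serialize(result?)?;
            }
            Ok(())
        }
        // BED4/BED5 input goes through MergingResultIterator, which also takes
        // a closure summarizing the grouped data (see Merge::run() below).
        _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
    }
}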
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "granges"
version = "0.2.0"
version = "0.2.1"
edition = "2021"
license = "MIT"
authors = ["Vince Buffalo <[email protected]>"]
@@ -14,7 +14,7 @@ description = "A Rust library and command line tool for genomic range operations
# clap = { version = "4.4.18", features = ["derive"], optional = true }
clap = { version = "4.4.18", features = ["derive"] }
coitrees = { version = "0.4.0", features = ["nosimd"] }
flate2 = "1.0.28"
flate2 = { version = "1.0.28", features = ["zlib-ng-compat"] }
genomap = "0.2.6"
indexmap = "2.2.3"
ndarray = { version = "0.15.6", optional = true}
43 changes: 43 additions & 0 deletions benches/bedtools_comparison.rs
@@ -357,8 +357,51 @@ fn bench_map_all_operations(c: &mut Criterion) {
});
}

fn bench_merge_empty(c: &mut Criterion) {
// create the benchmark group
let mut group = c.benchmark_group("merge_empty");

// create the test data
let input_bedfile = random_bed3file(BED_LENGTH);

// the distance
let distance = 100;

// configure the sample size for the group
group.sample_size(10);

group.bench_function("bedtools_merge", |b| {
b.iter(|| {
let bedtools_output = Command::new("bedtools")
.arg("merge")
.arg("-i")
.arg(input_bedfile.path())
.arg("-d")
.arg(distance.to_string())
.output()
.expect("bedtools merge failed");
assert!(bedtools_output.status.success());
});
});

group.bench_function("granges_merge", |b| {
b.iter(|| {
let granges_output = Command::new(granges_binary_path())
.arg("merge")
.arg("--bedfile")
.arg(input_bedfile.path())
.arg("-d")
.arg(distance.to_string())
.output()
.expect("bedtools merge failed");
assert!(granges_output.status.success());
});
});
}

criterion_group!(
benches,
bench_merge_empty,
bench_filter_adjustment,
bench_range_adjustment,
bench_flank,
194 changes: 173 additions & 21 deletions src/commands.rs
@@ -1,15 +1,18 @@
//! Command functions that implement each of the `granges` subcommands.
//!
// TODO: these functions should be methods of the input struct.

use std::{fs::File, io, path::PathBuf};

use clap::Parser;
use csv::WriterBuilder;
use std::{fs::File, io, path::PathBuf};

use crate::{
data::{operations::Operation, SerializableDatumType},
data::{operations::FloatOperation, SerializableDatumType},
io::{
parsers::{Bed5Iterator, GenomicRangesParser},
tsv::BED_TSV,
},
merging_iterators::{MergingEmptyResultIterator, MergingResultIterator},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty},
reporting::{CommandOutput, Report},
@@ -111,6 +114,11 @@ pub fn granges_adjust(
gr.adjust_ranges(-both, both)
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed4(iter) => {
let gr = GRanges::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both)
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed5(iter) => {
let gr = GRanges::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both)
@@ -130,7 +138,7 @@ }
}
}
}
Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), Some(report)))
}

/// Filters genomic ranges based on overlaps with another set of ranges.
@@ -168,9 +176,6 @@ pub fn granges_filter(
let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;

// for reporting stuff to the user
let report = Report::new();

match (left_iter, right_iter) {
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => {
let left_gr;
@@ -189,7 +194,7 @@
let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr;
@@ -211,7 +216,7 @@
let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr;
@@ -231,7 +236,7 @@
let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
let left_gr;
@@ -254,7 +259,7 @@
let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}
_ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
}
@@ -297,8 +302,6 @@ pub fn granges_flank(
let seqnames: Vec<String> = genome.keys().cloned().collect();
let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;

let report = Report::new();

match mode {
// Note: this is kept for benchmarking, to see how costly building GRanges
// objects is versus using streaming.
@@ -312,6 +315,16 @@
gr.flanking_ranges(left, right)?
.write_to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed4(iter) => {
let gr = if skip_missing {
GRanges::from_iter(iter.retain_seqnames(&seqnames), &genome)?
} else {
GRanges::from_iter(iter, &genome)?
};
gr.flanking_ranges(left, right)?
.write_to_tsv(output, &BED_TSV)?
}

GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
}
@@ -372,6 +385,9 @@ pub fn granges_flank(
}
}
}
GenomicRangesParser::Bed4(_iter) => {
unimplemented!()
}
GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
}
@@ -412,7 +428,7 @@ pub fn granges_flank(
}
}
}
Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}

/// # Developer Notes
@@ -421,13 +437,12 @@ pub fn granges_map(
seqlens: impl Into<PathBuf>,
left_path: &PathBuf,
right_path: &PathBuf,
operations: Vec<Operation>,
operations: Vec<FloatOperation>,
output: Option<&PathBuf>,
skip_missing: bool,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let report = Report::new();

let left_iter = Bed3Iterator::new(left_path)?;
let right_iter = Bed5Iterator::new(right_path)?;
@@ -487,7 +502,7 @@ pub fn granges_map(

result_gr.write_to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}

/// Generate a BED3 file of genomic windows.
@@ -500,8 +515,7 @@ pub fn granges_windows(
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?;
let report = Report::new();
Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}

/// Generate a random BED-like file with genomic ranges.
@@ -529,6 +543,144 @@ pub fn granges_random_bed(
gr.write_to_tsv(output, &BED_TSV)?;
};

let report = Report::new();
Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), None))
}

/// Merges all the genomic ranges if they overlap by `distance`.
#[derive(Parser)]
pub struct Merge {
/// The input BED-like TSV file to merge.
#[arg(short, long, required = true)]
bedfile: PathBuf,

/// The minimum distance at which to merge ranges. Like `bedtools merge`,
/// `--distance 0` will merge "book-ended" ranges. Negative numbers
/// will only merge ranges that overlap by that degree of overlap.
#[clap(short, long, default_value_t = 0)]
distance: PositionOffset,

///// Whether to "group by" feature name, i.e. overlapping ranges
///// with different feature names will not be merged.
//#[clap(short, long)]
//group_features: usize,
/// Operation to do to summarize the score column.
#[clap(short, long, value_parser = clap::value_parser!(FloatOperation))]
func: Option<FloatOperation>,

/// An optional output file (standard output will be used if not specified)
#[arg(short, long)]
output: Option<PathBuf>,
}

impl Merge {
// TODO optional genome file for validation?
pub fn run(&self) -> Result<CommandOutput<()>, GRangesError> {
let bedfile = &self.bedfile;
let distance = &self.distance;
let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;
let func = &self.func;

let writer_boxed: Box<dyn io::Write> = match &self.output {
Some(path) => Box::new(File::create(path)?),
None => Box::new(io::stdout()),
};
let mut writer = WriterBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_writer(writer_boxed);

match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let merging_iter = MergingEmptyResultIterator::new(iter, *distance);
for result in merging_iter {
let record = result?;
writer.serialize(record)?;
}
Ok(CommandOutput::new((), None))
}
GenomicRangesParser::Bed4(iter) => {
let merging_iter = MergingResultIterator::new(iter, *distance, |data| {
data.into_iter()
.map(|x| x.name)
.collect::<Vec<_>>()
.join(",")
});
for result in merging_iter {
let record = result?;
writer.serialize(record)?;
}
Ok(CommandOutput::new((), None))
}
GenomicRangesParser::Bed5(iter) => {
// merging iterator, where we extract scores and apply an operation to all merged genomic ranges' scores
let merging_iter = MergingResultIterator::new(iter, *distance, |data| {
let mut scores: Vec<f64> = data
.into_iter()
.filter_map(|bed5_cols| bed5_cols.score)
.collect();
// this unwrap is safe -- if func is None, we use Bed3
func.as_ref().unwrap().run(&mut scores)
});

for result in merging_iter {
let record = result?;
writer.serialize(record)?;
}
Ok(CommandOutput::new((), None))
}
GenomicRangesParser::Bedlike(_iter) => {
todo!()
}
GenomicRangesParser::Unsupported => {
Err(GRangesError::UnsupportedGenomicRangesFileFormat)
}
}
}
}

/// Filter out ranges not in the specified "genome" file.
#[derive(Parser)]
pub struct FilterChroms {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
genome: PathBuf,

/// The input BED file.
#[arg(short, long, required = true)]
bedfile: PathBuf,

/// An optional output file (standard output will be used if not specified)
#[arg(short, long)]
output: Option<PathBuf>,
}

impl FilterChroms {
pub fn run(&self) -> Result<CommandOutput<()>, GRangesError> {
let bedfile = &self.bedfile;
let genome = read_seqlens(&self.genome)?;

let writer_boxed: Box<dyn io::Write> = match &self.output {
Some(path) => Box::new(File::create(path)?),
None => Box::new(io::stdout()),
};

let mut writer = WriterBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_writer(writer_boxed);

let bedlike_iterator = BedlikeIterator::new(bedfile)?;

// If we don't need to sort, use iterator-based streaming processing.
for record in bedlike_iterator {
let range = record?;
let seqname = &range.seqname;
let passes_filter = genome.contains_key(seqname);
if passes_filter {
writer.serialize(range)?;
}
}

Ok(CommandOutput::new((), None))
}
}
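
The commit message calls this a cleaner way to build up subcommands: each subcommand is a self-contained clap `Parser` struct carrying its own arguments and a `run()` method, so the binary's dispatch reduces to a thin match. The wiring below is a hypothetical sketch, not taken from this commit's `main.rs`; the `granges::commands` path, the `Cli`/`Commands` names, and the doc strings are assumptions.

// Hypothetical wiring (not from this commit): dispatch new-style subcommands.
use clap::{Parser, Subcommand};
use granges::commands::{FilterChroms, Merge}; // assumed public re-exports
use granges::prelude::GRangesError;

#[derive(Parser)]
#[command(name = "granges")]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Merge overlapping or nearby ranges.
    Merge(Merge),
    /// Keep only ranges on chromosomes present in the genome file.
    FilterChroms(FilterChroms),
}

fn main() -> Result<(), GRangesError> {
    let cli = Cli::parse();
    // Each arm just delegates to the subcommand's own run() method.
    match cli.command {
        Commands::Merge(cmd) => cmd.run()?,
        Commands::FilterChroms(cmd) => cmd.run()?,
    };
    Ok(())
}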