From ba880b691620126aa1178ad7d77e5a3aa516f8b3 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sat, 2 Mar 2024 23:08:42 -0800 Subject: [PATCH] New merging iterators, cleaned up BED parsing, new `merge` command. - Merging iterator prototypes added. These are streaming iterators of sorted input that are merged based on distance or overlap. - Much cleaner BED variant detection, and `Bed4Iterator` method. - Entire organization of parsing modules. - New detection methods that try to use serde deserialization. - The new `merge` command is structured differently than other commands as part of a new cleaner way to build up subcommands in code. - New validation and benchmarks of `granges merge`. - New `granges_test_case_03()` and new test data. - New test utilities, e.g. `validate_bedfloats()` for validating agains bedtools map, etc functions. - Bedlike now based on csv + serde. - New `granges filter-chrom` command. --- Cargo.toml | 4 +- benches/bedtools_comparison.rs | 43 +++ src/commands.rs | 194 ++++++++++-- src/data/operations.rs | 24 +- src/error.rs | 3 + src/granges.rs | 56 +++- src/io/mod.rs | 3 +- src/io/parsers/bed.rs | 284 ----------------- src/io/parsers/bed/bed3.rs | 26 ++ src/io/parsers/bed/bed4.rs | 46 +++ src/io/parsers/bed/bed5.rs | 52 +++ src/io/parsers/bed/bedlike.rs | 161 ++++++++++ src/io/parsers/bed/mod.rs | 99 ++++++ src/io/parsers/detect.rs | 181 +++++++++++ src/io/parsers/filters.rs | 268 ++++++++++++++++ src/io/parsers/mod.rs | 564 +-------------------------------- src/io/parsers/tsv.rs | 52 ++- src/io/parsers/utils.rs | 85 +++++ src/iterators.rs | 106 ++++++- src/lib.rs | 16 +- src/main/mod.rs | 16 +- src/merging_iterators.rs | 475 +++++++++++++++++++++++++++ src/ranges/mod.rs | 42 +++ src/reporting.rs | 4 +- src/sequences/nucleotide.rs | 1 - src/test_utilities.rs | 77 ++++- src/traits.rs | 68 +++- tests/bedtools_validation.rs | 253 +++++++++++---- tests_data/example_bedlike.tsv | 10 +- tests_data/test_case_01.bed | 5 + 
tests_data/test_case_03.bed | 5 + 31 files changed, 2230 insertions(+), 993 deletions(-) delete mode 100644 src/io/parsers/bed.rs create mode 100644 src/io/parsers/bed/bed3.rs create mode 100644 src/io/parsers/bed/bed4.rs create mode 100644 src/io/parsers/bed/bed5.rs create mode 100644 src/io/parsers/bed/bedlike.rs create mode 100644 src/io/parsers/bed/mod.rs create mode 100644 src/io/parsers/detect.rs create mode 100644 src/io/parsers/filters.rs create mode 100644 src/merging_iterators.rs create mode 100644 tests_data/test_case_01.bed create mode 100644 tests_data/test_case_03.bed diff --git a/Cargo.toml b/Cargo.toml index 1f14e55..95bb0e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "granges" -version = "0.2.0" +version = "0.2.1" edition = "2021" license = "MIT" authors = ["Vince Buffalo "] @@ -14,7 +14,7 @@ description = "A Rust library and command line tool for genomic range operations # clap = { version = "4.4.18", features = ["derive"], optional = true } clap = { version = "4.4.18", features = ["derive"] } coitrees = { version = "0.4.0", features = ["nosimd"] } -flate2 = "1.0.28" +flate2 = { version = "1.0.28", features = ["zlib-ng-compat"] } genomap = "0.2.6" indexmap = "2.2.3" ndarray = { version = "0.15.6", optional = true} diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 723a3d0..b6eee58 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -357,8 +357,51 @@ fn bench_map_all_operations(c: &mut Criterion) { }); } +fn bench_merge_empty(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("merge_empty"); + + // create the test data + let input_bedfile = random_bed3file(BED_LENGTH); + + // the distance + let distance = 100; + + // configure the sample size for the group + group.sample_size(10); + + group.bench_function("bedtools_merge", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") 
+ .arg(input_bedfile.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_merge", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(input_bedfile.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + assert!(granges_output.status.success()); + }); + }); +} + criterion_group!( benches, + bench_merge_empty, bench_filter_adjustment, bench_range_adjustment, bench_flank, diff --git a/src/commands.rs b/src/commands.rs index 899435b..2e5a036 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,15 +1,18 @@ //! Command functions that implement each of the `granges` subcommands. +//! +// TODO: these functions should be methods of the input struct. -use std::{fs::File, io, path::PathBuf}; - +use clap::Parser; use csv::WriterBuilder; +use std::{fs::File, io, path::PathBuf}; use crate::{ - data::{operations::Operation, SerializableDatumType}, + data::{operations::FloatOperation, SerializableDatumType}, io::{ parsers::{Bed5Iterator, GenomicRangesParser}, tsv::BED_TSV, }, + merging_iterators::{MergingEmptyResultIterator, MergingResultIterator}, prelude::*, ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty}, reporting::{CommandOutput, Report}, @@ -111,6 +114,11 @@ pub fn granges_adjust( gr.adjust_ranges(-both, both) .write_to_tsv(output, &BED_TSV)? } + GenomicRangesParser::Bed4(iter) => { + let gr = GRanges::from_iter(iter, &genome)?; + gr.adjust_ranges(-both, both) + .write_to_tsv(output, &BED_TSV)? 
+ } GenomicRangesParser::Bed5(iter) => { let gr = GRanges::from_iter(iter, &genome)?; gr.adjust_ranges(-both, both) @@ -130,7 +138,7 @@ pub fn granges_adjust( } } } - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), Some(report))) } /// Filters genomic ranges based on overlaps with another set of ranges. @@ -168,9 +176,6 @@ pub fn granges_filter( let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; - // for reporting stuff to the user - let report = Report::new(); - match (left_iter, right_iter) { (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => { let left_gr; @@ -189,7 +194,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { let left_gr; @@ -211,7 +216,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { let left_gr; @@ -231,7 +236,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { let left_gr; @@ -254,7 +259,7 @@ pub fn granges_filter( let intersection = left_gr.filter_overlaps(&right_gr)?; intersection.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat), } @@ -297,8 +302,6 @@ pub fn granges_flank( let seqnames: Vec = genome.keys().cloned().collect(); let ranges_iter = 
GenomicRangesFile::parsing_iterator(bedfile)?; - let report = Report::new(); - match mode { // Note: this is kept for benchmarking, to see how costly building GRanges // objects is versus using streaming. @@ -312,6 +315,16 @@ pub fn granges_flank( gr.flanking_ranges(left, right)? .write_to_tsv(output, &BED_TSV)? } + GenomicRangesParser::Bed4(iter) => { + let gr = if skip_missing { + GRanges::from_iter(iter.retain_seqnames(&seqnames), &genome)? + } else { + GRanges::from_iter(iter, &genome)? + }; + gr.flanking_ranges(left, right)? + .write_to_tsv(output, &BED_TSV)? + } + GenomicRangesParser::Bed5(_iter) => { unimplemented!() } @@ -372,6 +385,9 @@ pub fn granges_flank( } } } + GenomicRangesParser::Bed4(_iter) => { + unimplemented!() + } GenomicRangesParser::Bed5(_iter) => { unimplemented!() } @@ -412,7 +428,7 @@ pub fn granges_flank( } } } - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// # Developer Notes @@ -421,13 +437,12 @@ pub fn granges_map( seqlens: impl Into, left_path: &PathBuf, right_path: &PathBuf, - operations: Vec, + operations: Vec, output: Option<&PathBuf>, skip_missing: bool, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); - let report = Report::new(); let left_iter = Bed3Iterator::new(left_path)?; let right_iter = Bed5Iterator::new(right_path)?; @@ -487,7 +502,7 @@ pub fn granges_map( result_gr.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// Generate a BED3 file of genomic windows. @@ -500,8 +515,7 @@ pub fn granges_windows( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?; - let report = Report::new(); - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// Generate a random BED-like file with genomic ranges. 
@@ -529,6 +543,144 @@ pub fn granges_random_bed( gr.write_to_tsv(output, &BED_TSV)?; }; - let report = Report::new(); - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) +} + +/// Merges all the genomic ranges if they overlap by `distance`. +#[derive(Parser)] +pub struct Merge { + /// The input BED-like TSV file to merge. + #[arg(short, long, required = true)] + bedfile: PathBuf, + + /// The minimum distance at which to merge ranges. Like `bedtools merge`, + /// `--distance 0` will merge "book-ended" ranges. Negative numbers + /// will only merge ranges that overlap by that degree of overlap. + #[clap(short, long, default_value_t = 0)] + distance: PositionOffset, + + ///// Whether to "group by" feature name, i.e. overlapping ranges + ///// with different feature names will not be merged. + //#[clap(short, long)] + //group_features: usize, + /// Operation to do to summarize the score column. + #[clap(short, long, value_parser = clap::value_parser!(FloatOperation))] + func: Option, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, +} + +impl Merge { + // TODO optional genome file for validation? 
+ pub fn run(&self) -> Result, GRangesError> { + let bedfile = &self.bedfile; + let distance = &self.distance; + let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; + let func = &self.func; + + let writer_boxed: Box = match &self.output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + match ranges_iter { + GenomicRangesParser::Bed3(iter) => { + let merging_iter = MergingEmptyResultIterator::new(iter, *distance); + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bed4(iter) => { + let merging_iter = MergingResultIterator::new(iter, *distance, |data| { + data.into_iter() + .map(|x| x.name) + .collect::>() + .join(",") + }); + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bed5(iter) => { + // merging iterator, where we extract scores and apply an operation to all merged genomic ranges' scores + let merging_iter = MergingResultIterator::new(iter, *distance, |data| { + let mut scores: Vec = data + .into_iter() + .filter_map(|bed5_cols| bed5_cols.score) + .collect(); + // this unwrap is safe -- if func is None, we use Bed3 + func.as_ref().unwrap().run(&mut scores) + }); + + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bedlike(_iter) => { + todo!() + } + GenomicRangesParser::Unsupported => { + Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + } + } +} + +/// Filter out ranges not in the specified "genome" file. 
+#[derive(Parser)] +pub struct FilterChroms { + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, + + /// The input BED file. + #[arg(short, long, required = true)] + bedfile: PathBuf, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, +} + +impl FilterChroms { + pub fn run(&self) -> Result, GRangesError> { + let bedfile = &self.bedfile; + let genome = read_seqlens(&self.genome)?; + + let writer_boxed: Box = match &self.output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + // If we don't need to sort, use iterator-based streaming processing. + for record in bedlike_iterator { + let range = record?; + let seqname = &range.seqname; + let passes_filter = genome.contains_key(seqname); + if passes_filter { + writer.serialize(range)?; + } + } + + Ok(CommandOutput::new((), None)) + } } diff --git a/src/data/operations.rs b/src/data/operations.rs index 05245a2..d2feffd 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -33,7 +33,7 @@ pub fn median(numbers: &mut [F]) -> Option { /// The (subset of) standard `bedtools map` operations. #[derive(Clone, Debug, ValueEnum)] -pub enum Operation { +pub enum FloatOperation { /// Calculate the sum of all values (a set of zero elements has sum 0.0). Sum, /// Calculate the sum of all values, but set of zero elements is a missing value, not 0.0. 
@@ -50,25 +50,25 @@ pub enum Operation { Collapse, } -impl Operation { +impl FloatOperation { #[inline(always)] pub fn run(&self, data: &mut [T]) -> DatumType where T: Float + Sum + ToPrimitive + Clone + ToString, { match self { - Operation::Sum => { + FloatOperation::Sum => { let sum: T = data.iter().copied().sum(); sum.into_data_type() } - Operation::SumNotEmpty => { + FloatOperation::SumNotEmpty => { if data.is_empty() { return DatumType::NoValue; } let sum: T = data.iter().copied().sum(); sum.into_data_type() } - Operation::Min => { + FloatOperation::Min => { let min = data .iter() .filter(|x| x.is_finite()) @@ -76,7 +76,7 @@ impl Operation { .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater)); min.map_or(DatumType::NoValue, |x| x.into_data_type()) } - Operation::Max => { + FloatOperation::Max => { let max = data .iter() .filter(|x| x.is_finite()) @@ -84,7 +84,7 @@ impl Operation { .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less)); max.map_or(DatumType::NoValue, |x| x.into_data_type()) } - Operation::Mean => { + FloatOperation::Mean => { if data.is_empty() { DatumType::NoValue } else { @@ -93,8 +93,10 @@ impl Operation { mean.into_data_type() } } - Operation::Median => median(data).map_or(DatumType::NoValue, |x| x.into_data_type()), - Operation::Collapse => { + FloatOperation::Median => { + median(data).map_or(DatumType::NoValue, |x| x.into_data_type()) + } + FloatOperation::Collapse => { let collapsed = data .iter() .map(|num| num.to_string()) @@ -106,6 +108,10 @@ impl Operation { } } +pub enum StringOperation { + Collapse, +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/error.rs b/src/error.rs index 3204519..7211d2e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -75,6 +75,9 @@ pub enum GRangesError { #[error("File reading error: {0}. 
Please check if the file exists and you have permission to read it.")] IOError(#[from] std::io::Error), + #[error("The specified file '{0}' is empty.")] + EmptyFile(String), + #[error("File parsing error: {0}")] TsvParsingError(#[from] csv::Error), diff --git a/src/granges.rs b/src/granges.rs index 62d4048..4dcd600 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -47,7 +47,7 @@ use serde::Serialize; use crate::{ ensure_eq, io::tsv::TsvConfig, - iterators::GRangesIterator, + iterators::{GRangesIterator, GRangesRecordIterator}, join::{ CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, CombinedJoinDataRightEmpty, JoinData, JoinDataBothEmpty, JoinDataLeftEmpty, @@ -1315,6 +1315,39 @@ where Ok(gr) } + ///// Retain the *intersection* of each left genomic ranges that has at least one + ///// overlap with the `right` set of genomic ranges. + ///// + ///// This will retain only the overlapping portion + ///// + ///// This is a type of *filtering join*, in particular a *semi-join*. + ///// See Hadley Wickham's [R for Data + ///// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + //pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( + // self, + // // right: &GRanges, DR>, + // right: &'a impl AsGRangesRef<'a, COITrees, DR>, + //) -> Result, GRangesError> { + // let mut gr = GRangesEmpty::new_vec(&self.seqlens()); + + // let right_ref = right.as_granges_ref(); + + // for (seqname, left_ranges) in self.0.ranges.iter() { + // for left_range in left_ranges.iter_ranges() { + // if let Some(right_ranges) = right_ref.ranges.get(seqname) { + // let num_overlaps = + // right_ranges.count_overlaps(left_range.start(), left_range.end()); + // if num_overlaps == 0 { + // // no overlaps -- skip + // } else { + // gr.push_range(seqname, left_range.start(), left_range.end())?; + // } + // } + // } + // } + // Ok(gr) + //} + /// Retain only genomic ranges that have at least one overlap with the `right` /// set of genomic ranges. 
The whole range will be retained. /// @@ -1471,6 +1504,17 @@ where } } +impl GRanges +where + R: IterableRangeContainer, + T: IndexedDataContainer, +{ + /// Create a new [`GRangesRecordIterator`] to iterate through all the ranges in this [`GRanges`] object. + pub fn iter_records(&self) -> GRangesRecordIterator<'_, R, T> { + GRangesRecordIterator::new(self) + } +} + /// [`PartialEq`] for [`GRanges`] objects. /// /// This is a more powerful comparison operator than [`GRanges.is_equal_to()`], since it will first @@ -1512,6 +1556,7 @@ where #[cfg(test)] mod tests { use crate::{ + iterators::GRangesRecordIterator, prelude::*, test_utilities::{granges_test_case_01, granges_test_case_02, random_vecranges}, Position, @@ -1901,4 +1946,13 @@ mod tests { assert_eq!(first_chrom.0, "chr1"); assert_eq!(*first_chrom.1, vec![2, 5, 13]); } + + #[test] + fn test_grange_iter_records() { + // main test is in iterators.rs module. + let gr = granges_test_case_01(); + let grr_iter = GRangesRecordIterator::new(&gr); + let iter = gr.iter_records(); + assert!(grr_iter.zip(iter).all(|(a, b)| a == b)); + } } diff --git a/src/io/mod.rs b/src/io/mod.rs index d804f08..d33c373 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,7 +6,8 @@ pub mod tsv; pub use file::{InputStream, OutputStream}; pub use parsers::{ - bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator, + bed::{Bed3Iterator, Bed4Iterator, Bed5Iterator, BedlikeIterator}, + tsv::TsvRecordIterator, GenomicRangesFile, GenomicRangesParser, }; pub use tsv::{TsvConfig, BED_TSV}; diff --git a/src/io/parsers/bed.rs b/src/io/parsers/bed.rs deleted file mode 100644 index d1811e3..0000000 --- a/src/io/parsers/bed.rs +++ /dev/null @@ -1,284 +0,0 @@ -//! BED Types and Functionality -//! -//! The BED (Browser Extensible Format) is a TSV format in bioinformatics. -//! It has a fairly strict [specification](https://samtools.github.io/hts-specs/BEDv1.pdf), -//! 
but in practice it is quite permissive, and in bioinformatics one encounters lots -//! of "BED-like" files. -//! -//! # Design -//! -//! Since BED files can be thought of BED3 + an optional *addition*, the type -//! returned by these parsing iterators is [`GenomicRangeRecord`] where -//! `U` is some sort of addition type like [`Bed5Addition`]. -//! -//! # ⚠️ Stability -//! -//! This module defines core BED types, but is under active development. -//! - -use serde::de::Error as DeError; -use serde::{Deserialize, Deserializer, Serialize}; -use std::io::{BufRead, BufReader}; -use std::path::PathBuf; -use std::str::FromStr; - -use crate::error::GRangesError; -use crate::io::InputStream; -use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; -use crate::Position; - -use super::parse_column; -use super::tsv::TsvRecordIterator; - -// for BedlikeIterator only -pub const PARSE_CAPACITY: usize = 512; - -/// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED -/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to -/// be used for missing strands, but in practice it can be found to represent -/// missing scores, etc too. -pub fn bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, - T: Deserialize<'de> + FromStr, - ::Err: std::fmt::Display, -{ - let missing_chars = &["."]; - deserialize_option_generic(deserializer, missing_chars) // Use the generic deserializer with specific placeholders -} - -/// The additional two BED5 columns. -/// -/// # Fields -/// * `name`: the feature name. -/// * `score`: a score. -// TODO/RENAME: maybe since this goes against spec it should -// be called Bed5AdditionPermissive? 
-#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -#[serde(deny_unknown_fields)] -pub struct Bed5Addition { - pub name: String, - #[serde(deserialize_with = "bed_missing")] - pub score: Option, -} - -/// An iterator over BED5 entries, which contain the three -/// range entries (sequence name, start and end positions), -/// a feature name, and a score. -/// -/// Note that the [`Bed5Addition`] is *permissive* in the sense -/// it allows for more input types than the official [BED format -/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). -// TODO strict type? -#[derive(Debug)] -pub struct Bed5Iterator { - iter: TsvRecordIterator>, -} - -impl Bed5Iterator { - /// Creates a parsing iterator over a BED5 file. - pub fn new(filepath: impl Into) -> Result { - let iter = TsvRecordIterator::new(filepath)?; - - Ok(Self { iter }) - } -} - -impl Iterator for Bed5Iterator { - type Item = Result, GRangesError>; - - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// An iterator over BED3 entries (which just contain ranges no data). -#[derive(Debug)] -pub struct Bed3Iterator { - iter: TsvRecordIterator, -} - -impl Bed3Iterator { - /// Creates a parsing iterator over a BED5 file. - pub fn new(filepath: impl Into) -> Result { - let iter = TsvRecordIterator::new(filepath)?; - Ok(Self { iter }) - } -} - -impl Iterator for Bed3Iterator { - type Item = Result; - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// Nucleotide strand enum type. -#[derive(Clone, Debug)] -pub enum Strand { - Forward, - Reverse, -} - -/// Deserializes some value of type `t` with some possible missing -/// character `missing_chars` into [`Option`]. 
-pub fn deserialize_option_generic<'de, D, T>( - deserializer: D, - missing_chars: &'de [&'de str], -) -> Result, D::Error> -where - D: Deserializer<'de>, - T: Deserialize<'de> + FromStr, - ::Err: std::fmt::Display, -{ - let s: String = Deserialize::deserialize(deserializer)?; - if missing_chars.contains(&s.as_str()) { - Ok(None) - } else { - s.parse::() - .map(Some) - .map_err(|e| DeError::custom(format!("parsing error: {}", e))) - } -} - -//impl Selection for &Bed5Addition { -// fn select_by_name(&self, name: &str) -> DatumType { -// match name { -// "name" => DatumType::String(self.name.clone()), -// "score" => DatumType::Float64(self.score), -// _ => panic!("No item named '{}'", name), -// } -// } -//} - -///// The additional three BED6 columns. -//#[derive(Clone, Debug)] -//pub struct Bed6Addition { -// pub name: String, -// pub score: f64, -// pub strand: Option, -//} - -/// A lazy parser for BED-like files. -/// yields [`GenomicRangeRecord>>`] entries. If the file is a BED3 file, -/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining -/// string columns to parse. -pub struct BedlikeIterator { - reader: BufReader>, - line_buffer: String, -} - -impl std::fmt::Debug for BedlikeIterator { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("BedlikeIterator").finish_non_exhaustive() - } -} - -impl BedlikeIterator { - /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser - /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), - /// and end (0-indeed and exclusive) positions. 
- pub fn new(filepath: impl Into) -> Result { - let input_file = InputStream::new(filepath); - // let _has_metadata = input_file.collect_metadata("#", None); - // let reader = input_file.continue_reading()?; - let reader = input_file.reader()?; - let line_buffer = String::with_capacity(PARSE_CAPACITY); - Ok(Self { - reader, - line_buffer, - }) - } -} - -impl Iterator for BedlikeIterator { - type Item = Result>, GRangesError>; - - fn next(&mut self) -> Option { - self.line_buffer.clear(); - - loop { - self.line_buffer.clear(); - match self.reader.read_line(&mut self.line_buffer) { - Ok(0) => return None, - Ok(_) => { - if !self.line_buffer.starts_with('#') { - let line = self.line_buffer.trim_end(); - return Some(parse_bed_lazy(line)); - } - // skip the metadata/comment character - } - Err(e) => return Some(Err(GRangesError::IOError(e))), - } - } - } -} - -/// Lazily parses a BED* format line into the first three columns defining the range, -/// storing the rest as a `String`. -/// -/// Unlike [`parse_bedlike()`], this function returns a concrete -/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. -pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { - let columns: Vec<&str> = line.splitn(4, '\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::Bed3TooFewColumns( - columns.len(), - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - let data = if columns.len() > 3 { - Some(columns[3].to_string()) - } else { - None - }; - - Ok(GenomicRangeRecord { - seqname, - start, - end, - data, - }) -} - -#[allow(dead_code)] -fn parse_strand(symbol: char) -> Result, GRangesError> { - match symbol { - '+' => Ok(Some(Strand::Forward)), - '-' => Ok(Some(Strand::Reverse)), - '.' 
=> Ok(None), - _ => Err(GRangesError::InvalidString), - } -} - -// TODO -///// Parses a BED6 format line into the three columns defining the range, and additional -///// columns -///// -//pub fn parse_bed6(line: &str) -> Result, GRangesError> { -// let columns: Vec<&str> = line.splitn(4, '\t').collect(); -// if columns.len() < 3 { -// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); -// } -// -// let seqname = parse_column(columns[0], line)?; -// let start: Position = parse_column(columns[1], line)?; -// let end: Position = parse_column(columns[2], line)?; -// -// let name = parse_column(columns[3], line)?; -// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; -// -// let data = Bed5Addition { name, score }; -// -// Ok(GenomicRangeRecord { -// seqname, -// start, -// end, -// data, -// }) -//} diff --git a/src/io/parsers/bed/bed3.rs b/src/io/parsers/bed/bed3.rs new file mode 100644 index 0000000..0458b99 --- /dev/null +++ b/src/io/parsers/bed/bed3.rs @@ -0,0 +1,26 @@ +//! BED3 Parsers, which are built off of the [`GenomicRangeRecordEmpty`]. +//! + +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecordEmpty, GRangesError}; +use std::path::PathBuf; + +/// An iterator over BED3 entries (which just contain ranges no data). +#[derive(Debug)] +pub struct Bed3Iterator { + iter: TsvRecordIterator, +} + +impl Bed3Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + Ok(Self { iter }) + } +} + +impl Iterator for Bed3Iterator { + type Item = Result; + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bed4.rs b/src/io/parsers/bed/bed4.rs new file mode 100644 index 0000000..6e03b1d --- /dev/null +++ b/src/io/parsers/bed/bed4.rs @@ -0,0 +1,46 @@ +//! BED4 Parsers, which are built off of the [`GenomicRangeRecordEmpty`] +//! and [`Bed4Addition`]. 
+ +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecord, GRangesError}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(deny_unknown_fields)] +pub struct Bed4Addition { + pub name: String, +} + +/// An iterator over BED4 entries, which contain the three +/// range entries (sequence name, start and end positions), +/// and a feature name +/// +/// Note that the [`Bed5Addition`] is *permissive* in the sense +/// it allows for more input types than the official [BED format +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). +// TODO strict type? +#[derive(Debug)] +pub struct Bed4Iterator { + iter: TsvRecordIterator>, +} + +impl Bed4Iterator { + /// Creates a parsing iterator over a BED4 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + + Ok(Self { iter }) + } +} + +impl Iterator for Bed4Iterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bed5.rs b/src/io/parsers/bed/bed5.rs new file mode 100644 index 0000000..cf0e7cb --- /dev/null +++ b/src/io/parsers/bed/bed5.rs @@ -0,0 +1,52 @@ +//! BED5 Parsers, which are built off of the [`GenomicRangeRecordEmpty`] +//! and [`Bed5Addition`]. + +use super::bed_missing; +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecord, GRangesError}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +/// * `score`: a score. +// TODO/RENAME: maybe since this goes against spec it should +// be called Bed5AdditionPermissive? 
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(deny_unknown_fields)] +pub struct Bed5Addition { + pub name: String, + #[serde(deserialize_with = "bed_missing")] + pub score: Option, +} + +/// An iterator over BED5 entries, which contain the three +/// range entries (sequence name, start and end positions), +/// a feature name, and a score. +/// +/// Note that the [`Bed5Addition`] is *permissive* in the sense +/// it allows for more input types than the official [BED format +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). +// TODO strict type? +#[derive(Debug)] +pub struct Bed5Iterator { + iter: TsvRecordIterator>, +} + +impl Bed5Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + + Ok(Self { iter }) + } +} + +impl Iterator for Bed5Iterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bedlike.rs b/src/io/parsers/bed/bedlike.rs new file mode 100644 index 0000000..3244e67 --- /dev/null +++ b/src/io/parsers/bed/bedlike.rs @@ -0,0 +1,161 @@ +//! Bed-like file lazy parsers. + +use std::{ + io::{BufRead, BufReader}, + path::PathBuf, +}; + +use crate::{ + io::{ + parsers::{tsv::build_tsv_reader, utils::parse_column}, + InputStream, + }, + ranges::GenomicRangeRecord, + GRangesError, Position, +}; + +// for BedlikeIterator only, TODO needed? +pub const PARSE_CAPACITY: usize = 512; + +/// A lazy parser for BED-like files. +/// yields [`GenomicRangeRecord>>`] entries. If the file is a BED3 file, +/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining +/// string columns to parse. 
+pub struct BedlikeIterator { + reader: BufReader>, + line_buffer: String, +} + +impl std::fmt::Debug for BedlikeIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BedlikeIterator").finish_non_exhaustive() + } +} + +impl BedlikeIterator { + /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser + /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), + /// and end (0-indeed and exclusive) positions. + pub fn new(filepath: impl Into) -> Result { + let input_file = InputStream::new(filepath); + // let _has_metadata = input_file.collect_metadata("#", None); + // let reader = input_file.continue_reading()?; + let reader = input_file.reader()?; + let line_buffer = String::with_capacity(PARSE_CAPACITY); + Ok(Self { + reader, + line_buffer, + }) + } +} + +impl Iterator for BedlikeIterator { + type Item = Result>, GRangesError>; + + fn next(&mut self) -> Option { + self.line_buffer.clear(); + + loop { + self.line_buffer.clear(); + match self.reader.read_line(&mut self.line_buffer) { + Ok(0) => return None, + Ok(_) => { + if !self.line_buffer.starts_with('#') { + let line = self.line_buffer.trim_end(); + return Some(parse_bed_lazy(line)); + } + // skip the metadata/comment character + } + Err(e) => return Some(Err(GRangesError::IOError(e))), + } + } + } +} + +/// Lazily parses a BED* format line into the first three columns defining the range, +/// storing the rest as a `String`. +/// +/// Unlike [`parse_bedlike()`], this function returns a concrete +/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. 
+pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { + let columns: Vec<&str> = line.splitn(4, '\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::Bed3TooFewColumns( + columns.len(), + line.to_string(), + )); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + let data = if columns.len() > 3 { + Some(columns[3].to_string()) + } else { + None + }; + + Ok(GenomicRangeRecord { + seqname, + start, + end, + data, + }) +} + +/// Inspect the first line to check that it looks like a valid BED-like +/// file, i.e. the first column is there (there are no reasonable checks +/// for sequence names other than presence), and the next to columns can +/// be parsed into a [`Position`]. +// TODO/NOTE: this is an older-style parser. We might want to try +// using csv here. + +pub fn valid_bedlike(filepath: impl Into) -> Result { + let filepath = filepath.into(); + let mut reader = build_tsv_reader(filepath)?; + let mut records = reader.records(); + if let Some(result) = records.next() { + let record = result?; + + if record.len() < 3 { + // too few columns to be BED-like + return Ok(false); + } + + // Attempt to parse the second and third columns as positions + let start_result = record.get(1).unwrap().trim().parse::(); + let end_result = record.get(2).unwrap().trim().parse::(); + + // Check if both positions are valid + match (start_result, end_result) { + (Ok(_), Ok(_)) => Ok(true), // both are valid + _ => Ok(false), // one or both is not valid + } + } else { + // No records found + Ok(false) + } +} + +#[cfg(test)] +mod tests { + use super::valid_bedlike; + + #[test] + fn test_valid_bedlike() { + // even bed files work + assert_eq!(valid_bedlike("tests_data/example.bed").unwrap(), true); + + // but not everything + assert_eq!( + valid_bedlike("tests_data/invalid_format.bed").unwrap(), + false + ); + + assert_eq!( + 
valid_bedlike("tests_data/example_bedlike.tsv").unwrap(), + true + ); + } +} diff --git a/src/io/parsers/bed/mod.rs b/src/io/parsers/bed/mod.rs new file mode 100644 index 0000000..80fdd46 --- /dev/null +++ b/src/io/parsers/bed/mod.rs @@ -0,0 +1,99 @@ +//! BED Types and Functionality +//! +//! The BED (Browser Extensible Format) is a TSV format in bioinformatics. +//! It has a fairly strict [specification](https://samtools.github.io/hts-specs/BEDv1.pdf), +//! but in practice it is quite permissive, and in bioinformatics one encounters lots +//! of "BED-like" files. +//! +//! # Design +//! +//! Since BED files can be thought of BED3 + an optional *addition*, the type +//! returned by these parsing iterators is [`GenomicRangeRecord`] where +//! `U` is some sort of addition type like [`Bed5Addition`]. +//! +//! # ⚠️ Stability +//! +//! This module defines core BED types, but is under active development. +//! + +pub mod bed3; +pub mod bed4; +pub mod bed5; +pub mod bedlike; + +pub use bed3::Bed3Iterator; +pub use bed4::{Bed4Addition, Bed4Iterator}; +pub use bed5::{Bed5Addition, Bed5Iterator}; +pub use bedlike::{valid_bedlike, BedlikeIterator}; + +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer}; +use std::str::FromStr; + +/// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to +/// be used for missing strands, but in practice it can be found to represent +/// missing scores, etc too. +pub fn bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let missing_chars = &["."]; + deserialize_option_generic(deserializer, missing_chars) // Use the generic deserializer with specific placeholders +} + +/// Nucleotide strand enum type. 
+#[derive(Clone, Debug)] +pub enum Strand { + Forward, + Reverse, +} + +/// Deserializes some value of type `t` with some possible missing +/// character `missing_chars` into [`Option`]. +pub fn deserialize_option_generic<'de, D, T>( + deserializer: D, + missing_chars: &'de [&'de str], +) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let s: String = Deserialize::deserialize(deserializer)?; + if missing_chars.contains(&s.as_str()) { + Ok(None) + } else { + s.parse::() + .map(Some) + .map_err(|e| DeError::custom(format!("parsing error: {}", e))) + } +} +// TODO +///// Parses a BED6 format line into the three columns defining the range, and additional +///// columns +///// +//pub fn parse_bed6(line: &str) -> Result, GRangesError> { +// let columns: Vec<&str> = line.splitn(4, '\t').collect(); +// if columns.len() < 3 { +// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); +// } +// +// let seqname = parse_column(columns[0], line)?; +// let start: Position = parse_column(columns[1], line)?; +// let end: Position = parse_column(columns[2], line)?; +// +// let name = parse_column(columns[3], line)?; +// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; +// +// let data = Bed5Addition { name, score }; +// +// Ok(GenomicRangeRecord { +// seqname, +// start, +// end, +// data, +// }) +//} diff --git a/src/io/parsers/detect.rs b/src/io/parsers/detect.rs new file mode 100644 index 0000000..54f0312 --- /dev/null +++ b/src/io/parsers/detect.rs @@ -0,0 +1,181 @@ +//! Filetype detection functionality. +//! 
+ +use serde::Deserialize; +use std::path::PathBuf; + +use super::{ + bed::{valid_bedlike, Bed4Addition, Bed4Iterator}, + tsv::build_tsv_reader, + utils::get_base_extension, + Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator, +}; +use crate::{ + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + GRangesError, +}; + +/// Enum that connects a genomic ranges file type to its specific parser. +#[derive(Debug)] +pub enum GenomicRangesParser { + Bed3(Bed3Iterator), + Bed4(Bed4Iterator), + Bed5(Bed5Iterator), + Bedlike(BedlikeIterator), + Unsupported, +} + +/// Enum that indicates the filetype of some genomic ranges file. +#[derive(Debug, PartialEq)] +pub enum GenomicRangesFile { + Bed3(PathBuf), + Bed4(PathBuf), + Bed5(PathBuf), + Bedlike(PathBuf), + Unsupported, +} + +/// Detect the BED variant by trying to deserialize the first line. +pub fn detect_bed_variant( + filepath: impl Into, +) -> Result, GRangesError> { + let filepath = filepath.into(); + + if try_deserialize::>(&filepath)? { + Ok(Some(GenomicRangesFile::Bed5(filepath))) + } else if try_deserialize::>(&filepath)? { + Ok(Some(GenomicRangesFile::Bed4(filepath))) + } else if try_deserialize::(&filepath)? { + Ok(Some(GenomicRangesFile::Bed3(filepath))) + } else { + Ok(None) + } +} + +/// Try to deserialize into a generic type `T`. +fn try_deserialize Deserialize<'de> + std::fmt::Debug>( + filepath: impl Into, +) -> Result { + let filepath = filepath.into(); + let reader = build_tsv_reader(&filepath)?; + let mut iter = reader.into_deserialize::(); + let next_item = iter.next(); + if let Some(result) = next_item { + Ok(result.is_ok()) + } else { + Err(GRangesError::EmptyFile( + filepath.to_string_lossy().to_string(), + )) + } +} + +impl GenomicRangesFile { + /// Detect the type of range genomic range file type we are working with, and output + /// the appropriate [`GenomicRangesFile`] enum variant. + /// + /// Detection works like this: + /// 1. Skip comment lines, starting with `#`. + /// 2. 
Retrieve extension, removing any additional compression-related + /// extensions (`.gz` and `.bgz`) if present. + /// 3. Read the first line, and try to parse the second and third columns + /// into [`Position`] types, to check if this is a BED-like file (i.e. + /// the first three columns are sequence name, start, and end positions). + /// 4. Match against + /// + /// Currently this supports: + /// 1. BED3 files, without range data. + /// 2. BED-like files, with BED3 first columns and optional additional columns + /// after that. If there is no additional columnar data after the first + /// three BED3 columns, the type is [`GenomicRangesFile::Bed3`]. If there + /// is additional columnar data, it is [`GenomicRangesFile::Bedlike`]. + /// 3. Files with `.tsv` extension but are BED-like (first three + /// columns are BED3) will have the type [`GenomicRangesFile::Bed3`]. This + /// is because downstream [`GRanges`] operations need to know if any + /// additional data is present, which would need to be put in a data container. + /// 4. BED5 files, which are BED3 + a *feature name* and a *strand* column. + /// 5. If the file type does not satisfy any of the rules above, it is + /// [`GenomicRangesFile::Unsupported`]. + /// + /// See the `match` statement in the source code for the exact rules. Additional + /// genomic range file formats like GTF/GFF can easily be added later. + /// + /// [`GRanges`]: crate::granges::GRanges + pub fn detect(filepath: impl Into) -> Result { + let filepath: PathBuf = filepath.into(); + + let is_valid_bedlike = valid_bedlike(&filepath)?; + + // get the extension, as a hint + let extension = + get_base_extension(&filepath).ok_or(GRangesError::CouldNotDetectRangesFiletype)?; + + // If it's got a .tsv extension, take this as a hint it *isn't a BED*, + // thus, this goes to the BedlikeIterator parser. 
+ if extension.ends_with("tsv") && is_valid_bedlike { + return Ok(GenomicRangesFile::Bedlike(filepath)); + } + + // Let's try the strict serde-based deserialization approach first. + if let Some(bed_filetype) = detect_bed_variant(&filepath)? { + return Ok(bed_filetype); + } + + if is_valid_bedlike { + return Ok(GenomicRangesFile::Bedlike(filepath)); + } + + Ok(GenomicRangesFile::Unsupported) + } + + /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error + /// if the filetype is not supported. + /// + /// This returns a [`GenomicRangesParser`] enum, since the parsing iterator filetype + /// cannot be known at compile time. + pub fn parsing_iterator( + filepath: impl Clone + Into, + ) -> Result { + let path = filepath.into(); + match Self::detect(path)? { + GenomicRangesFile::Bed3(path) => { + Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)) + } + GenomicRangesFile::Bed4(path) => { + Ok(GenomicRangesParser::Bed4(Bed4Iterator::new(path)?)) + } + GenomicRangesFile::Bed5(path) => { + Ok(GenomicRangesParser::Bed5(Bed5Iterator::new(path)?)) + } + GenomicRangesFile::Bedlike(path) => { + Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)) + } + GenomicRangesFile::Unsupported => Err(GRangesError::UnsupportedGenomicRangesFileFormat), + } + } +} + +#[cfg(test)] +mod tests { + use super::GenomicRangesFile; + + #[test] + fn test_rangefiletype_detect() { + let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bed3(_) + )); + + let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bedlike(_) + )); + + let range_filetype = GenomicRangesFile::detect("tests_data/test_case_03.bed"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bed5(_) + )); + } +} diff --git a/src/io/parsers/filters.rs b/src/io/parsers/filters.rs new file mode 
100644 index 0000000..d179ebc --- /dev/null +++ b/src/io/parsers/filters.rs @@ -0,0 +1,268 @@ +//! Filters for parsing iterators. + +use crate::error::GRangesError; +use crate::io::TsvRecordIterator; +use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; +use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; +use std::collections::HashSet; + +use super::bed::{Bed4Addition, Bed4Iterator}; +use super::{Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator}; + +/// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. +/// +/// Note that that the exclude filter is prioritized over the retain filter. So, +/// if a pipeline contains both, then if a chromosome is supplied to +/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, +/// it is skipped. This is to prevent validation and returning a [`Result`]. +/// +/// # Example +/// +/// ``` +/// use granges::prelude::*; +/// +/// let iter = Bed3Iterator::new("tests_data/example.bed") +/// .expect("error reading file") +/// .exclude_seqnames(&vec!["chr1".to_string()]); +/// +/// let seqlens = seqlens! 
{ "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; +/// let gr = GRangesEmpty::from_iter(iter, &seqlens) +/// .expect("parsing error"); +/// let mut iter = gr.iter_ranges(); +/// +/// // the first range should be the third range in the file, +/// // chr2:4-5 +/// assert_eq!(iter.next().unwrap().start, 4); +/// assert_eq!(iter.next().unwrap().end, 6); +/// assert_eq!(iter.next().unwrap().end, 15); +/// assert_eq!(iter.next(), None); +/// ``` +#[derive(Debug)] +pub struct FilteredRanges +where + I: Iterator>, +{ + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, +} + +impl FilteredRanges +where + I: Iterator>, +{ + pub fn new( + inner: I, + retain_seqnames: Option<&Vec>, + exclude_seqnames: Option<&Vec>, + ) -> Self { + let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); + Self { + inner, + retain_seqnames, + exclude_seqnames, + } + } +} + +/// Range-filtering iterator implementation for [`GenomicRangeRecord`]. +impl Iterator for FilteredRanges> +where + I: Iterator, GRangesError>>, +{ + type Item = Result, GRangesError>; + + /// Get the next filtered entry, prioritizing exclude over retain. + fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +/// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. +impl Iterator for FilteredRanges +where + I: Iterator>, +{ + type Item = Result; + + /// Get the next filtered entry, prioritizing exclude over retain. 
+ fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +impl GeneralRangeRecordIterator> + for TsvRecordIterator> +where + U: Clone + for<'de> serde::Deserialize<'de>, +{ + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator>> for BedlikeIterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator for Bed3Iterator { + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for Bed4Iterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for Bed5Iterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } 
+ fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +#[derive(Debug)] +pub struct UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + inner: I, +} + +impl UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + pub fn new(inner: I) -> Self { + Self { inner } + } +} + +impl Iterator for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + if let Some(result) = self.inner.next() { + return Some(result.and_then(|record| match record.data { + Some(data) => Ok(GenomicRangeRecord::new( + record.seqname, + record.start, + record.end, + data, + )), + None => Err(GRangesError::TryUnwrapDataError), + })); + } + None + } +} + +impl GenomicRangeRecordUnwrappable for BedlikeIterator { + fn try_unwrap_data(self) -> UnwrappedRanges { + UnwrappedRanges::new(self) + } +} diff --git a/src/io/parsers/mod.rs b/src/io/parsers/mod.rs index 681620e..46ede26 100644 --- a/src/io/parsers/mod.rs +++ b/src/io/parsers/mod.rs @@ -62,568 +62,12 @@ //! 
pub mod bed; +pub mod detect; +pub mod filters; pub mod tsv; pub mod utils; pub use bed::{Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator}; +pub use detect::{GenomicRangesFile, GenomicRangesParser}; -use std::collections::HashSet; -use std::io::BufRead; -use std::path::PathBuf; - -use crate::error::GRangesError; -use crate::io::file::InputStream; -use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; -use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; -use crate::Position; - -use self::utils::get_base_extension; - -use super::TsvRecordIterator; - -/// Inspect the first line to check that it looks like a valid BED-like -/// file, i.e. the first column is there (there are no reasonable checks -/// for sequence names other than presence), and the next to columns can -/// be parsed into a [`Position`]. -fn valid_bedlike(input_file: &mut InputStream) -> Result { - let _metadata = input_file.collect_metadata("#", None)?; - let mut reader = input_file.continue_reading()?; - let mut first_line = String::new(); - reader.read_line(&mut first_line)?; - - let columns = first_line - .splitn(4, '\t') - .map(String::from) - .collect::>(); - - if columns.len() < 3 { - // too few columns to be BED-like - return Ok(false); - } - - // Attempt to parse the second and third columns as positions - let start_result = columns[1].trim().parse::(); - let end_result = columns[2].trim().parse::(); - - // Check if both positions are valid - match (start_result, end_result) { - (Ok(_), Ok(_)) => Ok(true), // both are valid - _ => Ok(false), // one or both is not valid - } -} - -// TODO: we can combine this and the above into an enum -fn valid_bed5(input_file: &mut InputStream) -> Result { - let _metadata = input_file.collect_metadata("#", None)?; - let mut reader = input_file.continue_reading()?; - let mut first_line = String::new(); - reader.read_line(&mut first_line)?; - - let columns = first_line - .splitn(4, '\t') - .map(String::from) - 
.collect::>(); - - if columns.len() == 5 { - // Attempt to parse the second and third columns as positions - let valid_start = columns[1].trim().parse::().is_ok(); - let valid_end = columns[2].trim().parse::().is_ok(); - // this should always be true - let _ = columns[3].trim().parse::().is_ok(); - let valid_strand = columns[4].trim().parse::().is_ok(); - return Ok(valid_start && valid_end && valid_strand); - } - Ok(false) -} - -/// Enum that connects a genomic ranges file type to its specific parser. -#[derive(Debug)] -pub enum GenomicRangesParser { - Bed3(Bed3Iterator), - Bed5(Bed5Iterator), - Bedlike(BedlikeIterator), - Unsupported, -} - -/// Enum that indicates the filetype of some genomic ranges file. -#[derive(Debug, PartialEq)] -pub enum GenomicRangesFile { - Bed3(PathBuf), - Bed5(PathBuf), - Bedlike(PathBuf), - Unsupported, -} - -impl GenomicRangesFile { - /// Detect the type of range genomic range file type we are working with, and output - /// the appropriate [`GenomicRangesFile`] enum variant. - /// - /// Detection works like this: - /// 1. Skip comment lines, starting with `#`. - /// 2. Retrieve extension, removing any additional compression-related - /// extensions (`.gz` and `.bgz`) if present. - /// 3. Read the first line, and try to parse the second and third columns - /// into [`Position`] types, to check if this is a BED-like file (i.e. - /// the first three columns are sequence name, start, and end positions). - /// 4. Match against - /// - /// Currently this supports: - /// 1. BED3 files, without range data. - /// 2. BED-like files, with BED3 first columns and optional additional columns - /// after that. If there is no additional columnar data after the first - /// three BED3 columns, the type is [`GenomicRangesFile::Bed3`]. If there - /// is additional columnar data, it is [`GenomicRangesFile::Bedlike`]. - /// 3. Files with `.tsv` extension but are BED-like (first three - /// columns are BED3) will have the type [`GenomicRangesFile::Bed3`]. 
This - /// is because downstream [`GRanges`] operations need to know if any - /// additional data is present, which would need to be put in a data container. - /// 4. BED5 files, which are BED3 + a *feature name* and a *strand* column. - /// 5. If the file type does not satisfy any of the rules above, it is - /// [`GenomicRangesFile::Unsupported`]. - /// - /// See the `match` statement in the source code for the exact rules. Additional - /// genomic range file formats like GTF/GFF can easily be added later. - /// - /// [`GRanges`]: crate::granges::GRanges - pub fn detect(filepath: impl Into) -> Result { - let path: PathBuf = filepath.into(); - let mut input_file = InputStream::new(&path); - let _metadata = input_file.collect_metadata("#", None)?; - let number_columns = input_file.detect_columns("\t")?; - - // get the extension, as a hint - let extension = - get_base_extension(&path).ok_or(GRangesError::CouldNotDetectRangesFiletype)?; - - // test if the first row can be parsed into a BED-like file format. - let is_valid_bedlike = valid_bedlike(&mut input_file)?; - let is_valid_bed5 = valid_bed5(&mut input_file)?; // TODO OPTIMIZE merge with above - let file_type = match ( - extension.as_str(), - number_columns, - is_valid_bedlike, - is_valid_bed5, - ) { - ("bed", 3, true, false) => GenomicRangesFile::Bed3(path), - ("tsv", 3, true, false) => GenomicRangesFile::Bed3(path), - ("bed", 5, true, true) => GenomicRangesFile::Bed5(path), - ("bed", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), - ("tsv", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), - _ => GenomicRangesFile::Unsupported, - }; - Ok(file_type) - } - - /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error - /// if the filetype is not supported. - /// - /// This returns a [`GenomicRangesParser`] enum, since the parsing iterator filetype - /// cannot be known at compile time. 
- pub fn parsing_iterator( - filepath: impl Clone + Into, - ) -> Result { - let path = filepath.into(); - match Self::detect(path)? { - GenomicRangesFile::Bed3(path) => { - Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)) - } - GenomicRangesFile::Bed5(path) => { - Ok(GenomicRangesParser::Bed5(Bed5Iterator::new(path)?)) - } - GenomicRangesFile::Bedlike(path) => { - Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)) - } - GenomicRangesFile::Unsupported => Err(GRangesError::UnsupportedGenomicRangesFileFormat), - } - } -} - -/// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. -/// -/// Note that that the exclude filter is prioritized over the retain filter. So, -/// if a pipeline contains both, then if a chromosome is supplied to -/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, -/// it is skipped. This is to prevent validation and returning a [`Result`]. -/// -/// # Example -/// -/// ``` -/// use granges::prelude::*; -/// -/// let iter = Bed3Iterator::new("tests_data/example.bed") -/// .expect("error reading file") -/// .exclude_seqnames(&vec!["chr1".to_string()]); -/// -/// let seqlens = seqlens! 
{ "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; -/// let gr = GRangesEmpty::from_iter(iter, &seqlens) -/// .expect("parsing error"); -/// let mut iter = gr.iter_ranges(); -/// -/// // the first range should be the third range in the file, -/// // chr2:4-5 -/// assert_eq!(iter.next().unwrap().start, 4); -/// assert_eq!(iter.next().unwrap().end, 6); -/// assert_eq!(iter.next().unwrap().end, 15); -/// assert_eq!(iter.next(), None); -/// ``` -#[derive(Debug)] -pub struct FilteredRanges -where - I: Iterator>, -{ - inner: I, - retain_seqnames: Option>, - exclude_seqnames: Option>, -} - -impl FilteredRanges -where - I: Iterator>, -{ - pub fn new( - inner: I, - retain_seqnames: Option<&Vec>, - exclude_seqnames: Option<&Vec>, - ) -> Self { - let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); - let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); - Self { - inner, - retain_seqnames, - exclude_seqnames, - } - } -} - -/// Range-filtering iterator implementation for [`GenomicRangeRecord`]. -impl Iterator for FilteredRanges> -where - I: Iterator, GRangesError>>, -{ - type Item = Result, GRangesError>; - - /// Get the next filtered entry, prioritizing exclude over retain. - fn next(&mut self) -> Option { - for item in self.inner.by_ref() { - match &item { - Ok(entry) => { - if self - .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } - if self - .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } - } - Err(_) => return Some(item), - } - } - None - } -} - -/// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. -impl Iterator for FilteredRanges -where - I: Iterator>, -{ - type Item = Result; - - /// Get the next filtered entry, prioritizing exclude over retain. 
- fn next(&mut self) -> Option { - for item in self.inner.by_ref() { - match &item { - Ok(entry) => { - if self - .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } - if self - .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } - } - Err(_) => return Some(item), - } - } - None - } -} - -impl GeneralRangeRecordIterator> - for TsvRecordIterator> -where - U: Clone + for<'de> serde::Deserialize<'de>, -{ - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator>> for BedlikeIterator { - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges>> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges>> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator for Bed3Iterator { - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator> for Bed5Iterator { - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator> for UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - 
FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -#[derive(Debug)] -pub struct UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - inner: I, -} - -impl UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - pub fn new(inner: I) -> Self { - Self { inner } - } -} - -impl Iterator for UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - type Item = Result, GRangesError>; - - fn next(&mut self) -> Option { - if let Some(result) = self.inner.next() { - return Some(result.and_then(|record| match record.data { - Some(data) => Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )), - None => Err(GRangesError::TryUnwrapDataError), - })); - } - None - } -} - -impl GenomicRangeRecordUnwrappable for BedlikeIterator { - fn try_unwrap_data(self) -> UnwrappedRanges { - UnwrappedRanges::new(self) - } -} - -/// Parses a single column from a string slice into a specified type. -/// -/// This function is particularly useful for converting columns in genomic data files -/// from string representation to a specific type like `usize`, `f64`, etc. -/// -/// # Type Parameters -/// -/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. -/// -/// # Arguments -/// -/// * `column`: The string slice representing the column to be parsed. -/// * `line`: The entire line from which the column was extracted. Used for error reporting. -/// -/// # Errors -/// -/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. 
-pub fn parse_column(column: &str, line: &str) -> Result -where - ::Err: std::fmt::Debug, -{ - column - .parse::() - .map_err(|_| GRangesError::InvalidColumnType { - expected_type: std::any::type_name::().to_string(), // Provides the expected type name - found_value: column.to_string(), - line: line.to_string(), - }) -} - -/// Parses a BED-like TSV format line into its constituent components. -/// -/// BED-like formats contain BED3 columns (chromosome, start, and end) and -/// others. It's an non-standard but extensible (since it's just a TSV), and sensible -/// for a variety of genomic data. -/// -/// # Arguments -/// -/// * `line`: A string slice representing a line in BED3 format. -/// -/// # Errors -/// -/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns -/// or if any column cannot be correctly parsed. -pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { - let columns: Vec<&str> = line.split('\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::BedTooFewColumns( - columns.len(), - 3, - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - // Collect remaining columns - let additional_columns = columns[3..].to_vec(); - - Ok((seqname, start, end, additional_columns)) -} - -#[cfg(test)] -mod tests { - use crate::io::{ - parsers::{utils::get_base_extension, valid_bedlike}, - InputStream, - }; - - use super::GenomicRangesFile; - - #[test] - fn test_get_base_extension() { - assert_eq!(get_base_extension("test.bed.gz").unwrap(), "bed"); - assert_eq!(get_base_extension("test.bed").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.bed.gz").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.bed").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.gff").unwrap(), "gff"); - 
assert_eq!(get_base_extension("some/path/test.gff.gz").unwrap(), "gff"); - assert_eq!(get_base_extension("test.gff.gz").unwrap(), "gff"); - assert_eq!(get_base_extension("test.gff").unwrap(), "gff"); - assert_eq!(get_base_extension("test"), None); - assert_eq!(get_base_extension("foo/test"), None); - } - - #[test] - fn test_rangefiletype_detect() { - let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); - assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bed3(_) - )); - - let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); - assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bedlike(_) - )); - } - - #[test] - fn test_valid_bedlike() { - assert_eq!( - valid_bedlike(&mut InputStream::new("tests_data/example.bed")).unwrap(), - true - ); - assert_eq!( - valid_bedlike(&mut InputStream::new("tests_data/invalid_format.bed")).unwrap(), - false - ); - } - - //#[test] - //fn test_parser() { - // // based on this example: https://docs.rs/noodles-bed/latest/noodles_bed/struct.Reader.html#method.records - // let iter = Bed3Iterator::new("tests_data/noodles_example.bed").unwrap(); - - // let seqlens = seqlens! { "sq0" => 10 }; - // let gr: GRanges = GRanges::from_iter(iter, &seqlens).unwrap(); - - // assert_eq!(gr.len(), 2); - - // // let mut gr_iter = gr.iter_ranges(); - // // let first_interval = gr_iter.next().unwrap(); - // // assert_eq!(first_interval.first, 7); - // // assert_eq!(first_interval.last, 12); - // // - // // let second_interval = gr_iter.next().unwrap(); - // // assert_eq!(second_interval.first, 20); - // // assert_eq!(second_interval.last, 33); - //} - - //#[test] - //fn test_invalid_bedlike_iterator() { - // let iter = BedlikeIterator::new("tests_data/invalid.bed").unwrap(); - // let seqlens = seqlens! 
{ "sq0" => 10 }; - // let result: Result, _> = GRanges::from_iter(iter.drop_data(), &seqlens); - - // // note: the Rust LSP thinks this isn't used for some reason, so prefaced with _ - // // to silence warnings. - // let _msg = "column '-1' in 'chr1\t-1\t20'".to_string(); - // assert!(matches!(result, Err(GRangesError::InvalidColumnType(_msg)))); - //} -} +pub use filters::{FilteredRanges, UnwrappedRanges}; diff --git a/src/io/parsers/tsv.rs b/src/io/parsers/tsv.rs index 7c89864..13b6702 100644 --- a/src/io/parsers/tsv.rs +++ b/src/io/parsers/tsv.rs @@ -1,7 +1,7 @@ //! Essential TSV parsing functionality, which wraps the blazingly-fast [`csv`] crate's //! deserialization method using [`serde`]. -use csv::{DeserializeRecordsIntoIter, ReaderBuilder}; +use csv::{DeserializeRecordsIntoIter, Reader, ReaderBuilder}; use flate2::read::GzDecoder; use serde::de::Error as DeError; use serde::{Deserialize, Deserializer}; @@ -12,6 +12,38 @@ use std::str::FromStr; use crate::error::GRangesError; +/// Build a TSV reader which ignores comment lines, works on gzip-compressed +/// files, etc. +/// +/// # ⚠️ Stability +/// +/// This will likely change into a type with methods. +/// +/// # Developers Notes +/// +/// Currently headers are ignored, and not properly passed +/// to the output. If you need this feature prioritized, please submit a +/// GitHub issue. +pub fn build_tsv_reader( + filepath: impl Into, +) -> Result>, GRangesError> { + let filepath = filepath.into(); + let file = File::open(&filepath)?; + let is_gzipped = is_gzipped_file(&filepath)?; + let stream: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + + let reader = ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .comment(Some(b'#')) + .from_reader(stream); + Ok(reader) +} + /// Deserializes some value of type `t` with some possible missing /// character `missing_chars` into [`Option`]. 
pub fn deserialize_option_generic<'de, D, T>( @@ -47,7 +79,7 @@ impl std::fmt::Debug for TsvRecordIterator { } /// Check if a file is a gzipped by looking for the magic numbers -fn is_gzipped_file(file_path: impl Into) -> io::Result { +pub fn is_gzipped_file(file_path: impl Into) -> io::Result { let mut file = File::open(file_path.into())?; let mut buffer = [0; 2]; file.read_exact(&mut buffer)?; @@ -68,21 +100,7 @@ where /// E.g. for VCF, it would need to be parsed. pub fn new(filepath: impl Into) -> Result { let filepath = filepath.into(); - - let file = File::open(&filepath)?; - let is_gzipped = is_gzipped_file(&filepath)?; - let stream: Box = if is_gzipped { - Box::new(GzDecoder::new(file)) - } else { - Box::new(file) - }; - - let reader = ReaderBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .comment(Some(b'#')) - .from_reader(stream); - + let reader = build_tsv_reader(filepath)?; let inner = reader.into_deserialize(); Ok(Self { inner }) diff --git a/src/io/parsers/utils.rs b/src/io/parsers/utils.rs index a841036..843f950 100644 --- a/src/io/parsers/utils.rs +++ b/src/io/parsers/utils.rs @@ -1,5 +1,71 @@ use std::path::Path; +use crate::{GRangesError, Position}; + +/// Parses a single column from a string slice into a specified type. +/// +/// This function is particularly useful for converting columns in genomic data files +/// from string representation to a specific type like `usize`, `f64`, etc. +/// +/// # Type Parameters +/// +/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. +/// +/// # Arguments +/// +/// * `column`: The string slice representing the column to be parsed. +/// * `line`: The entire line from which the column was extracted. Used for error reporting. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. 
+pub fn parse_column(column: &str, line: &str) -> Result +where + ::Err: std::fmt::Debug, +{ + column + .parse::() + .map_err(|_| GRangesError::InvalidColumnType { + expected_type: std::any::type_name::().to_string(), // Provides the expected type name + found_value: column.to_string(), + line: line.to_string(), + }) +} + +/// Parses a BED-like TSV format line into its constituent components. +/// +/// BED-like formats contain BED3 columns (chromosome, start, and end) and +/// others. It's an non-standard but extensible (since it's just a TSV), and sensible +/// for a variety of genomic data. +/// +/// # Arguments +/// +/// * `line`: A string slice representing a line in BED3 format. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns +/// or if any column cannot be correctly parsed. +pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedTooFewColumns( + columns.len(), + 3, + line.to_string(), + )); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + // Collect remaining columns + let additional_columns = columns[3..].to_vec(); + + Ok((seqname, start, end, additional_columns)) +} + /// Get the *base* extension to help infer filetype, which ignores compression-related /// extensions (`.gz` and `.bgz`). 
pub fn get_base_extension>(filepath: P) -> Option { @@ -31,3 +97,22 @@ pub fn get_base_extension>(filepath: P) -> Option { None } } + +#[cfg(test)] +mod tests { + use super::get_base_extension; + + #[test] + fn test_get_base_extension() { + assert_eq!(get_base_extension("test.bed.gz").unwrap(), "bed"); + assert_eq!(get_base_extension("test.bed").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.bed.gz").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.bed").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.gff").unwrap(), "gff"); + assert_eq!(get_base_extension("some/path/test.gff.gz").unwrap(), "gff"); + assert_eq!(get_base_extension("test.gff.gz").unwrap(), "gff"); + assert_eq!(get_base_extension("test.gff").unwrap(), "gff"); + assert_eq!(get_base_extension("test"), None); + assert_eq!(get_base_extension("foo/test"), None); + } +} diff --git a/src/iterators.rs b/src/iterators.rs index 8e89373..3b70f4b 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,17 +1,18 @@ -//! Iterators over genomic ranges and their data. +//! Iterators over genomic ranges. //! -//! Note that *parsing iterators* are in [`parsers`]. +//! Note that *parsing iterators* are in [`parsers`]. These work over //! //! [`parsers`]: crate::io::parsers use genomap::GenomeMap; use crate::{ - ranges::GenomicRangeIndexedRecord, - traits::{GenericRange, IterableRangeContainer, RangeContainer}, + granges::GRanges, + ranges::{GenomicRangeIndexedRecord, GenomicRangeRecord}, + traits::{GenericRange, IndexedDataContainer, IterableRangeContainer, RangeContainer}, }; /// An iterator yielding [`GenomicRangeIndexedRecord`], which store -/// indices to the sequence names and data container. +/// indices to their sequence names and data container elements. 
/// /// # Developer Notes /// Using indices, rather than references to data items and `&str` directly, @@ -74,9 +75,80 @@ where } } +/// An iterator over [`GenomicRangeRecord`] with the data +/// accessed from a [`IndexedDataContainer`]. +pub struct GRangesRecordIterator<'a, R, T> +where + R: IterableRangeContainer, +{ + seqnames: Vec, + ranges: &'a GenomeMap, + data: &'a T, + current_seqname_index: usize, + current_range_iter: Box::RangeType> + 'a>, +} + +impl<'a, R, T> GRangesRecordIterator<'a, R, T> +where + R: IterableRangeContainer, +{ + pub fn new(granges: &'a GRanges) -> Self { + let current_range_iter = granges.ranges.get_by_index(0).unwrap().iter_ranges(); + Self { + seqnames: granges.seqnames(), + ranges: &granges.ranges, + data: granges.data.as_ref().unwrap(), + current_seqname_index: 0, + current_range_iter, + } + } +} + +impl<'a, R, T> Iterator for GRangesRecordIterator<'a, R, T> +where + R: RangeContainer + IterableRangeContainer, + ::RangeType: GenericRange, + T: IndexedDataContainer, +{ + // no matter what the input type is, we can always + // turn it into this general type. + type Item = GenomicRangeRecord<::OwnedItem>; + + fn next(&mut self) -> Option { + loop { + if let Some(next_range) = self.current_range_iter.next() { + let data = self.data.get_owned(next_range.index().unwrap()); + return Some(GenomicRangeRecord { + // TODO how expensive is cloning here? + seqname: self.seqnames[self.current_seqname_index].clone(), + start: next_range.start(), + end: next_range.end(), + data, + }); + } else { + // try to load another sequence's set of ranges. 
+ self.current_seqname_index += 1; + if self.current_seqname_index >= self.ranges.len() { + // we're out of range container iterators + return None; + } + self.current_range_iter = self + .ranges + .get_by_index(self.current_seqname_index) + .unwrap() + .iter_ranges(); + } + } + } +} + #[cfg(test)] mod tests { - use crate::{ranges::GenomicRangeIndexedRecord, test_utilities::granges_test_case_01}; + use crate::{ + iterators::GRangesRecordIterator, + ranges::{GenomicRangeIndexedRecord, GenomicRangeRecord}, + test_utilities::granges_test_case_01, + }; use super::GRangesIterator; @@ -89,4 +161,26 @@ mod tests { GenomicRangeIndexedRecord::new(0, 0, 5, Some(0)) ); } + + #[test] + fn test_genomic_record_ranges_iterator() { + let gr = granges_test_case_01(); + let mut iter = GRangesRecordIterator::new(&gr); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 0, 5, 1.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 4, 7, 8.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr2".to_string(), 10, 20, 3.7) + ); + } } diff --git a/src/lib.rs b/src/lib.rs index a50dbe4..5043309 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -359,6 +359,7 @@ pub mod granges; pub mod io; pub mod iterators; pub mod join; +pub mod merging_iterators; pub mod ranges; pub mod sequences; pub mod test_utilities; @@ -369,6 +370,8 @@ pub mod traits; pub mod commands; pub mod reporting; +pub use crate::error::GRangesError; + /// The main position type in GRanges. /// /// This type is currently an unwrapped [`u32`]. 
This should handle @@ -406,15 +409,24 @@ pub mod prelude { pub use crate::io::file::read_seqlens; pub use crate::io::tsv::BED_TSV; pub use crate::io::{ - Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, - TsvRecordIterator, + Bed3Iterator, Bed4Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, + GenomicRangesParser, TsvRecordIterator, + }; + pub use crate::join::{ + CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, + CombinedJoinDataRightEmpty, }; pub use crate::data::DatumType; pub use crate::ranges::{ + coitrees::{COITreesEmpty, COITreesIndexed}, try_range, vec::{VecRangesEmpty, VecRangesIndexed}, }; + + pub use crate::merging_iterators::{ + ConditionalMergingResultIterator, MergingEmptyResultIterator, MergingResultIterator, + }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, diff --git a/src/main/mod.rs b/src/main/mod.rs index 142b90a..373a852 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -3,9 +3,10 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ commands::{ - granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, ProcessingMode, + granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, FilterChroms, + Merge, ProcessingMode, }, - data::operations::Operation, + data::operations::FloatOperation, prelude::GRangesError, Position, PositionOffset, }; @@ -28,6 +29,8 @@ Subcommands: map: Compute the left grouped overlaps between the left genomic ranges and right genomic ranges, and apply one or more operations to the score column of the right BED5 file. + + merge: Merge ranges that are within a minimum distance of each other. windows: Create a set of genomic windows of the specified width (in basepairs), stepping the specified step size (the width, by default). 
@@ -79,6 +82,7 @@ enum Commands { sort: bool, // TODO add skip_missing here }, + FilterChroms(FilterChroms), /// Filter out the left ranges that do not have overlaps with any /// right ranges. This is a "semi-join" in SQL terminology. Filter { @@ -157,8 +161,8 @@ enum Commands { right: PathBuf, /// Operation - #[clap(short, long, value_parser = clap::value_parser!(Operation), use_value_delimiter = true, value_delimiter = ',')] - func: Vec, + #[clap(short, long, value_parser = clap::value_parser!(FloatOperation), use_value_delimiter = true, value_delimiter = ',')] + func: Vec, /// An optional output file (standard output will be used if not specified) #[arg(short, long)] @@ -169,6 +173,7 @@ enum Commands { #[arg(short, long)] skip_missing: bool, }, + Merge(Merge), /// Create a set of genomic windows ranges using the specified width /// and step size, and output to BED3. /// @@ -240,6 +245,7 @@ fn run() -> Result<(), GRangesError> { output, skip_missing, }) => granges_filter(genome, left, right, output.as_ref(), *skip_missing), + Some(Commands::FilterChroms(filter_chroms)) => filter_chroms.run(), Some(Commands::Flank { genome, bedfile, @@ -301,6 +307,8 @@ fn run() -> Result<(), GRangesError> { *skip_missing, ) } + // NOTE: this is the new API, so clean! + Some(Commands::Merge(merge)) => merge.run(), Some(Commands::Windows { genome, width, diff --git a/src/merging_iterators.rs b/src/merging_iterators.rs new file mode 100644 index 0000000..c298785 --- /dev/null +++ b/src/merging_iterators.rs @@ -0,0 +1,475 @@ +//! Merging iterators +//! +// TODO: these probably could use better names? +// TODO: we should support weighting by overlaps. 
+use std::cmp::max; + +use crate::{ + error::GRangesError, + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + traits::GenericRange, + PositionOffset, +}; + +// Create a new [`MergingEmptyResultIterator`], which work over data-less +// "empty" ranges ([`GenomicRangeRecordEmpty`] and merge them based on their +// distance or degree of overlap. +pub struct MergingEmptyResultIterator +where + I: IntoIterator>, +{ + last_range: Option, + inner: ::IntoIter, + minimum_distance: PositionOffset, +} + +impl MergingEmptyResultIterator +where + I: IntoIterator>, +{ + pub fn new(inner: I, minimum_distance: PositionOffset) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + } + } +} + +impl Iterator for MergingEmptyResultIterator +where + I: IntoIterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + for result in self.inner.by_ref() { + let next_range = match result { + Ok(range) => range, + Err(e) => return Some(Err(e)), + }; + + if let Some(last_range) = &mut self.last_range { + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + last_range.end = max(last_range.end, next_range.end); + } else { + let return_range = last_range.clone(); + self.last_range = Some(next_range); + return Some(Ok(return_range)); + } + } else { + self.last_range = Some(next_range); + // If this is the first range, we continue looking for more to potentially merge with. + continue; + } + } + + // If we get here, the inner iterator is exhausted. + // We need to return the last_range if it exists and make sure it's cleared for subsequent calls. + self.last_range.take().map(Ok) + } +} + +/// An iterator over [`Result, GRangesError>`] that +/// merges ranges that are less than some specified `minimum_distance` apart. +/// If `minimum_distance` is negative, it is taken as a minimum overlap width +/// to merge at. 
+pub struct MergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + accumulated_data: Vec, +} + +impl MergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for MergingResultIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + for next_result in self.inner.by_ref() { + match next_result { + Ok(next_range) => { + if let Some(ref mut last_range) = self.last_range { + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. + let final_data = + (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extend range with accumulated data. 
+ return Some(Ok(return_range)); + } + } else { + // There is no last range -- this must be the first. + self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + Err(e) => return Some(Err(e)), + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(Ok(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + })); + } + } + + None // Return None when all items have been processed + } +} + +/// A merging iterator (over [`Result, GRangesError`] items, +/// i.e. from a parsing iterator) that has an additional merge condition tested +/// by function `G` based on the items. +pub struct ConditionalMergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + group: G, + accumulated_data: Vec, +} + +impl ConditionalMergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F, group: G) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + group, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for ConditionalMergingResultIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + 
type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + for next_result in self.inner.by_ref() { + match next_result { + Ok(next_range) => { + if let Some(ref mut last_range) = self.last_range { + // If we have an additional group-by merging condition, + // check that. If not set this is always true. + let satifies_groupby = (self.group)(last_range, &next_range); + + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && satifies_groupby + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. + let final_data = + (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extended range with accumulated data. + return Some(Ok(return_range)); + } + } else { + // There is no last range -- this must be the first. 
+ self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + Err(e) => return Some(Err(e)), + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(Ok(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + })); + } + } + + None // Return None when all items have been processed + } +} + +#[cfg(test)] +mod tests { + use crate::{ + io::{parsers::Bed5Addition, Bed3Iterator, Bed5Iterator}, + merging_iterators::{ConditionalMergingResultIterator, MergingEmptyResultIterator}, + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + }; + + use super::MergingResultIterator; + + #[test] + fn test_merging_iterators() { + // NOTE/TODO: the file-based test is because we don't have a good + // way yet to go from `granges_test_case_01()` into a parsing-like + // iterator. 
+ let iter = Bed5Iterator::new("tests_data/test_case_01.bed").unwrap(); + + let sum_scores = + |data: Vec| data.iter().map(|bed5| bed5.score.unwrap()).sum::(); + + let merged_iter = MergingResultIterator::new(iter, 0, sum_scores); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + // dbg!(&results); + + assert_eq!( + results[0], + GenomicRangeRecord::new("chr1".to_string(), 0, 7, 9.2,) + ); + + assert_eq!( + results[1], + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1,) + ); + + assert_eq!( + results[2], + GenomicRangeRecord::new( + "chr2".to_string(), + 10, + 32, + 4.800000000000001, // simplifies float comparison + ) + ); + } + + #[test] + fn test_merging_empty_iterators() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + let merged_iter = MergingEmptyResultIterator::new(iter, 0); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 7,) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr1".to_string(), 10, 17,) + ); + + assert_eq!( + results[2], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_merging_empty_iterators_overlap_neg2() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + // with -2, we require *at least two* overlapping basepairs. 
+ // chr1 ranges: [0, 5), [4, 7) - these overlap by one, not merged; no others + // chr2 ranges: [10, 20), [18, 32) - these overlap by two, so merged + let merged_iter = MergingEmptyResultIterator::new(iter, -2); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 5,) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr1".to_string(), 4, 7,) + ); + + assert_eq!( + results[2], + GenomicRangeRecordEmpty::new("chr1".to_string(), 10, 17,) + ); + + assert_eq!( + results[3], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_merging_empty_iterators_overlap_pos10() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + // with 10, we should just have two ranges. + let merged_iter = MergingEmptyResultIterator::new(iter, 10); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 17) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_conditional_merging_iterators() { + let iter = Bed5Iterator::new("tests_data/test_case_03.bed").unwrap(); + + let sum_scores = + |data: Vec| data.iter().map(|bed5| bed5.score.unwrap()).sum::(); + + fn group_features( + range_1: &GenomicRangeRecord, + range_2: &GenomicRangeRecord, + ) -> bool { + range_1.data.name == range_2.data.name + } + + let merged_iter = + ConditionalMergingResultIterator::new(iter, 0, sum_scores, group_features); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + // unlike unconditional merging iterators, the first ranges here, + // though book-ended, are *not* overlapped because they have different feature + // names + assert_eq!( + results[0], + GenomicRangeRecord::new("chr1".to_string(), 0, 5, 1.1,) + ); + + assert_eq!( + results[1], + 
GenomicRangeRecord::new("chr1".to_string(), 4, 7, 8.1,) + ); + + assert_eq!( + results[2], + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1,) + ); + + // these *are* merged because they have the same feature name + assert_eq!( + results[3], + GenomicRangeRecord::new( + "chr2".to_string(), + 10, + 32, + 4.800000000000001, // simplifies float comparison + ) + ); + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 4719b46..a4d4c1d 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -185,6 +185,20 @@ impl GenomicRangeRecord { data, } } + + /// Consume this [`GenomicRangeRecord`], apply a function to its data + /// and return the new [`GenomicRangeRecord`] with different data. + pub fn into_map_data(self, func: F) -> GenomicRangeRecord + where + F: Fn(U) -> V, + { + GenomicRangeRecord { + seqname: self.seqname, + start: self.start, + end: self.end, + data: func(self.data), + } + } } impl GenericRange for GenomicRangeRecord { @@ -249,6 +263,7 @@ impl GenericRangeOperations for GenomicRangeRecord { /// Represents a genomic range entry without data, e.g. from a BED3 parser. 
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] pub struct GenomicRangeRecordEmpty { pub seqname: String, pub start: Position, @@ -504,4 +519,31 @@ mod tests { let range_b = RangeEmpty::new(2, 5); assert_eq!(range_a.overlap_width(&range_b), 3); } + + #[test] + fn test_distance_or_overlap() { + // | 0 | 1 | | | + // | | | | | 4 | 5 | + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(4, 6); + assert_eq!(range_a.distance_or_overlap(&range_b), 2); + + // | 0 | 1 | | | + // | | | 2 | 3 | 4 | 5 | + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(2, 6); + assert_eq!(range_a.distance_or_overlap(&range_b), 0); + + // | 0 | 1 | 2 | | + // | | | 2 | 3 | 4 | | + let range_a = RangeEmpty::new(1, 3); + let range_b = RangeEmpty::new(2, 5); + assert_eq!(range_a.distance_or_overlap(&range_b), -1); + + // | | 1 | 2 | 3 | 4 | ... + // | | | 2 | 3 | 4 | | + let range_a = RangeEmpty::new(1, 10); + let range_b = RangeEmpty::new(2, 5); + assert_eq!(range_a.distance_or_overlap(&range_b), -3); + } } diff --git a/src/reporting.rs b/src/reporting.rs index 3fd7353..1c09e5b 100644 --- a/src/reporting.rs +++ b/src/reporting.rs @@ -12,11 +12,11 @@ #[allow(unused)] pub struct CommandOutput { value: U, - report: Report, + report: Option, } impl CommandOutput { - pub fn new(value: U, report: Report) -> Self { + pub fn new(value: U, report: Option) -> Self { Self { value, report } } } diff --git a/src/sequences/nucleotide.rs b/src/sequences/nucleotide.rs index b476492..7bcee34 100644 --- a/src/sequences/nucleotide.rs +++ b/src/sequences/nucleotide.rs @@ -459,7 +459,6 @@ mod tests { .region_map(&get_len, "chr2", 0, *chr2_len) .unwrap(); assert_eq!(total_len, *chr2_len as usize); - dbg!(&reference); assert!(reference.is_loaded("chr2")); } diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 9a66f20..e8a7760 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,14 +1,14 @@ //! 
Test cases and test utility functions. //! -use std::path::PathBuf; +use std::{io::BufRead, path::PathBuf}; use crate::{ commands::granges_random_bed, create_granges_with_seqlens, error::GRangesError, granges::GRangesEmpty, - io::parsers::bed::Bed5Addition, + io::{parsers::bed::Bed5Addition, InputStream}, prelude::{GRanges, VecRangesIndexed}, ranges::{ coitrees::COITrees, @@ -199,6 +199,18 @@ pub fn random_bed5file(length: usize) -> NamedTempFile { temp_bedfile } +// "head" a file -- useful for peeking at temp files in failed tests. +pub fn head_file(filepath: impl Into) { + let filepath = filepath.into(); + let file = InputStream::new(filepath); + let mut reader = file.reader().unwrap(); + for _ in 0..10 { + let mut line = String::new(); + reader.read_line(&mut line).unwrap(); + dbg!(&line); + } +} + /// Range test case #1 /// /// This also has a corresponding reference sequence /// /// Ranges: /// - chr1: -/// (0, 5, Some(1.1)) -/// (4, 7, Some(8.1)) -/// (10, 17, Some(10.1)) +/// (0, 5, 1.1) +/// (4, 7, 8.1) +/// (10, 17, 10.1) /// - chr2: -/// (10, 20, Some(3.7)) -/// (18, 32, Some(1.1)) +/// (10, 20, 3.7) +/// (18, 32, 1.1) /// /// Seqlens: { "chr1" => 30, "chr2" => 100 } /// @@ -241,6 +253,27 @@ pub fn granges_test_case_02() -> GRanges> { }, seqlens: { "chr1" => 50, "chr2" => 300 }) } +/// Range test case #3 (modified from #1) +/// +/// This is a test case with Bed5 data. This is to test +/// merging iterators, with grouping by feature name. +/// +/// This matches `tests_data/test_case_03.bed`.
+/// +/// Seqlens: { "chr1" => 30, "chr2" => 100 } +/// +/// Sum of all elements: 24.1 +/// +pub fn granges_test_case_03() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(0, 5, Bed5Addition { name: "a".to_string(), score: Some(1.1) }), + (4, 7, Bed5Addition { name: "b".to_string(), score: Some(8.1) }), + (10, 17, Bed5Addition { name: "c".to_string(), score: Some(10.1) })], + "chr2" => [(10, 20, Bed5Addition { name: "d".to_string(), score: Some(3.7)}), + (18, 32, Bed5Addition { name: "d".to_string(), score: Some(1.1) })] + }, seqlens: { "chr1" => 30, "chr2" => 100 }) +} + #[cfg(feature = "ndarray")] /// Mock numeric `NumericSequences1` data pub fn random_array1_sequences(n: usize) -> GenomeMap> { @@ -281,3 +314,33 @@ pub fn random_array2_sequences(n: usize) -> GenomeMap> { } seqs } + +#[cfg(test)] +mod tests { + use crate::{ + granges::GRanges, io::Bed5Iterator, seqlens, test_utilities::granges_test_case_01, + }; + + #[test] + fn test_test_case_01() { + let genome = seqlens!("chr1" => 30, "chr2" => 100); + // yea we're testing test cases. yo dawg. + let iter = Bed5Iterator::new("tests_data/test_case_01.bed").unwrap(); + + // extract out the score, passing the errors through + // NOTE: this is because we don't have a try_fold iterator + // method setup yet -- we need that, and when we get it, + // we should adjust this + let iter_simplified = iter.map(|result| { + match result { + // this unwrap is for '.'
+ Ok(record) => Ok(record.into_map_data(|bed5_cols| bed5_cols.score.unwrap())), + Err(e) => Err(e), + } + }); + + // build a new GRanges + let gr = GRanges::from_iter(iter_simplified, &genome).unwrap(); + assert_eq!(gr, granges_test_case_01()); + } +} diff --git a/src/traits.rs b/src/traits.rs index 8e87db3..a2d062b 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -16,7 +16,7 @@ use crate::{ join::LeftGroupedJoin, prelude::VecRangesIndexed, ranges::GenomicRangeRecord, - Position, + Position, PositionOffset, }; /// Traits for [`GRanges`] types that can be modified. @@ -79,6 +79,27 @@ pub trait GenericRange: Clone { overlap_end.saturating_sub(overlap_start) } + fn distance_or_overlap(&self, other: &R) -> PositionOffset { + if self.end() > other.start() && self.start() < other.end() { + // The ranges overlap -- return how much as a negative number + // TODO/OPTIMIZE: is this slow? + let overlap: PositionOffset = self.overlap_width(other).try_into().unwrap(); + -overlap + } else if self.end() == other.start() || self.start() == other.end() { + // The ranges are bookends (adjacent to each other) + 0 + } else { + // The ranges do not overlap and are not bookends; calculate the distance + // If `self` is entirely before `other`, the distance is `other.start() - self.end()`. + // If `self` is entirely after `other`, the distance is `self.start() - other.end()`. + // Using `max` to ensure we always get the positive distance regardless of range order + std::cmp::max( + other.start().saturating_sub(self.end()), + self.start().saturating_sub(other.end()), + ) as PositionOffset + } + } + /// Return a tuple of the range created by an overlap with another range; `None` if no overlap. 
fn overlap_range(&self, other: &R) -> Option<(Position, Position)> { let overlap_start = std::cmp::max(self.start(), other.start()); @@ -97,6 +118,51 @@ pub trait GenericRange: Clone { } } +/// The [`GenericGenomicRange`] extends sequence name comparison and related +/// functionality to [`GenericRange`]. +// TODO: to do this right and have it be useful we need to push +// the seqname indices lower into parsing, e.g. so this works for key types +// like GenericGenomicRange. +pub trait GenericGenomicRange: GenericRange { + /// Returns the sequence name index associated with the range + fn seqname_index(&self) -> usize; + + /// Calculate how many basepairs overlap this range and another, considering seqname_index + fn genomic_overlap_width(&self, other: &R) -> Position { + if self.seqname_index() != other.seqname_index() { + return 0; // No overlap if seqname_index is different + } + + let overlap_start = std::cmp::max(self.start(), other.start()); + let overlap_end = std::cmp::min(self.end(), other.end()); + + if overlap_start >= overlap_end { + 0 + } else { + overlap_end.saturating_sub(overlap_start) + } + } + + /// Return a tuple of the range created by an overlap with another range, considering seqname_index + fn genomic_overlap_range( + &self, + other: &R, + ) -> Option<(Position, Position)> { + if self.seqname_index() != other.seqname_index() { + return None; // No overlap if seqname_index is different + } + + let overlap_start = std::cmp::max(self.start(), other.start()); + let overlap_end = std::cmp::min(self.end(), other.end()); + + if overlap_start <= overlap_end { + Some((overlap_start, overlap_end)) + } else { + None + } + } +} + /// The [`JoinDataOperations`] trait unifies common operations /// over combined join data types ([`CombinedJoinData`], /// [`CombinedJoinDataBothEmpty`], etc). 
diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 49ebf7c..9095184 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -2,20 +2,94 @@ use granges::{ commands::granges_random_bed, - io::{parsers::bed::bed_missing, InputStream}, + io::parsers::bed::bed_missing, prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator}, ranges::GenomicRangeRecord, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, + Position, }; +use indexmap::IndexMap; use std::{ fs::File, - io::BufRead, path::PathBuf, process::{Command, Stdio}, }; use serde::Deserialize; +/// A macro to ensure that standard out is the same. In some cases +/// this cannot be used, e.g. bedtools and granges have different +/// output float precision, so validation in that case is more involved. +/// Importantly, this also checks that standard out has a length > 0, +/// which can be a sneaky edge case where output "matches" but is not +/// valid since the comparison will pass spuriously. +macro_rules! 
assert_stdout_eq { + ($left:expr, $right:expr) => { + let left_output = String::from_utf8_lossy(&$left.stdout); + let right_output = String::from_utf8_lossy(&$right.stdout); + + // Check that lengths are greater than 0 + assert!( + !left_output.is_empty() && !right_output.is_empty(), + "One or both outputs are empty" + ); + + // Check that the outputs are equal + assert_eq!(left_output, right_output, "Outputs are not equal"); + }; +} + +fn validate_bedfloats( + bedtools_path: impl Into, + granges_path: impl Into, + genome: &IndexMap, + tol: f64, + context: Option, +) { + let bedtools_path = bedtools_path.into(); + let granges_path = granges_path.into(); + let bedtools_iter = BedlikeIterator::new(bedtools_path).unwrap(); + let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); + + let granges_iter = BedlikeIterator::new(granges_path).unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); + + let granges_data = granges_gr.take_data().unwrap(); + let granges_data = granges_data.iter().map(|extra_cols| { + // explicitly handle missing, then unwrap (rather than ok(), which is less safe) + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: Result = extra_cols.as_ref().unwrap().parse(); + // dbg!(&extra_cols); + Some(score.unwrap()) + }); + + let bedtools_data = bedtools_gr.take_data().unwrap(); + let bedtools_data = bedtools_data.iter().map(|extra_cols| { + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); + Some(score) + }); + assert_eq!(granges_data.len(), bedtools_data.len()); + + granges_data + .zip(bedtools_data) + .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { + (Some(gr), Some(bd)) => assert!((gr - bd).abs() < tol), + // NOTE: for some sum operations with no data, + // bedtools returns '.' not 0.0. The latter is more correct + // (the sum of the empty set is not NA, it's 0.0). 
+ // This is a shim so tests don't stochastically break + // in this case. + (Some(n), None) if n == 0.0 => (), + (None, None) => (), + _ => panic!("{:?}", (&context, &gr_val, &bd_val)), + }); +} + // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. #[cfg(not(feature = "bench-big"))] @@ -23,18 +97,7 @@ const BED_LENGTH: usize = 10; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; -// helpers - -fn head_file(filepath: impl Into) { - let filepath = filepath.into(); - let file = InputStream::new(&filepath); - let mut reader = file.reader().unwrap(); - for _ in 0..10 { - let mut line = String::new(); - reader.read_line(&mut line).unwrap(); - dbg!(&line); - } -} +// -- some validation helpers #[macro_export] macro_rules! assert_float_tol { @@ -73,6 +136,8 @@ macro_rules! assert_option_float_tol { }}; } +// -- main validation tests below -- + #[test] fn test_random_bed3file_filetype_detect() { let random_bedfile_path = temp_bedfile().path().to_path_buf(); @@ -86,7 +151,7 @@ fn test_random_bed3file_filetype_detect() { ) .expect("could not generate random BED file"); - head_file(&random_bedfile_path); + // head_file(&random_bedfile_path); match GenomicRangesFile::detect(random_bedfile_path).unwrap() { GenomicRangesFile::Bed3(_) => (), _ => panic!(": could not detect correct filetype"), @@ -134,10 +199,7 @@ fn test_against_bedtools_slop() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - assert_eq!( - String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools intersect -a -b -wa -u @@ -190,10 +252,7 @@ fn test_against_bedtools_intersect_wa() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - assert_eq!( - 
String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools flank -g -i -l 10 -r 20 @@ -245,6 +304,8 @@ fn test_against_bedtools_flank() { let bedtools_str = String::from_utf8_lossy(&bedtools_output.stdout); let granges_str = String::from_utf8_lossy(&granges_output.stdout); + assert!(bedtools_str.len() > 0); + assert!(granges_str.len() > 0); let mut bedtools_ranges: Vec<_> = bedtools_str.split("\n").collect(); let mut granges_ranges: Vec<_> = granges_str.split("\n").collect(); @@ -287,11 +348,7 @@ fn test_against_bedtools_makewindows() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - - assert_eq!( - String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } } } @@ -369,47 +426,13 @@ fn test_against_bedtools_map() { assert!(granges_output.status.success(), "{:?}", granges_output); let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); - - let bedtools_iter = BedlikeIterator::new(bedtools_path.path()).unwrap(); - let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); - - let granges_iter = BedlikeIterator::new(granges_output_file.path().to_path_buf()).unwrap(); - let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); - - let granges_data = granges_gr.take_data().unwrap(); - let granges_data = granges_data.iter().map(|extra_cols| { - // explicitly handle missing, then unwrap (rather than ok(), which is less safe) - if *extra_cols == Some(".".to_string()) { - return None; - } - let score: Result = extra_cols.as_ref().unwrap().parse(); - // dbg!(&extra_cols); - Some(score.unwrap()) - }); - - let bedtools_data = bedtools_gr.take_data().unwrap(); - let bedtools_data = bedtools_data.iter().map(|extra_cols| { - if 
*extra_cols == Some(".".to_string()) { - return None; - } - let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); - Some(score) - }); - assert_eq!(granges_data.len(), bedtools_data.len()); - - granges_data - .zip(bedtools_data) - .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { - (Some(gr), Some(bd)) => assert!((gr - bd).abs() < 1e-5), - // NOTE: for some sum operations with no data, - // bedtools returns '.' not 0.0. The latter is more correct - // (the sum of the empty set is not NA, it's 0.0). - // This is a shim so tests don't stochastically break - // in this case. - (Some(n), None) if n == 0.0 => (), - (None, None) => (), - _ => panic!("{:?}", (&operation, &gr_val, &bd_val)), - }); + validate_bedfloats( + bedtools_path.path(), + granges_output_file.path().to_path_buf(), + &genome, + 1e-6, + format!("operation: {}", operation).into(), + ); } } @@ -497,7 +520,7 @@ fn test_against_bedtools_map_multiple() { median: Option, } - head_file(&bedtools_path.path()); + // head_file(&bedtools_path.path()); let bedtools_iter = TsvRecordIterator::>::new(bedtools_path.path()).expect("HERE"); let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); @@ -506,6 +529,7 @@ fn test_against_bedtools_map_multiple() { granges_output_file.path().to_path_buf(), ) .unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); let granges_data = granges_gr.take_data().unwrap(); @@ -527,3 +551,92 @@ fn test_against_bedtools_map_multiple() { assert_option_float_tol!(gr.mean, bd.mean); }); } + +#[test] +fn test_against_bedtools_merge_empty() { + let num_ranges = BED_LENGTH; + let random_bedfile_path = random_bed3file(num_ranges); + let distances = vec![0, 1, 10, 20]; + + for distance in distances { + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") + .arg(&random_bedfile_path.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + + let granges_output = 
Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(&random_bedfile_path.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("granges merge failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + assert_stdout_eq!(bedtools_output, granges_output); + } +} + +#[test] +fn test_against_bedtools_merge_map() { + let num_ranges = BED_LENGTH; + let bedscores_file = random_bed5file(num_ranges); + let distances = vec![0, 1, 10, 20]; + + let operations = vec!["sum", "min", "max", "mean", "median"]; + + for operation in operations { + for distance in &distances { + let bedtools_path = temp_bedfile(); + let bedtools_output_file = File::create(&bedtools_path).unwrap(); + + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") + .arg(&bedscores_file.path()) + .arg("-d") + .arg(distance.to_string()) + .arg("-c") + .arg("5") // the score column + .arg("-o") + .arg(operation) + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools merge failed"); + + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(&bedscores_file.path()) + .arg("-d") + .arg(distance.to_string()) + .arg("--func") + .arg(operation) + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges merge failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + // head_file(granges_output_file.path()); + assert!(granges_output.status.success(), "{:?}", granges_output); + // head_file(bedtools_path.path()); + + let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); + validate_bedfloats( + bedtools_path.path(), + granges_output_file.path().to_path_buf(), + &genome, + 1e-6, + format!("operation: {}, distance: {}", operation, distance).into(), + ); + } + } +} diff --git 
a/tests_data/example_bedlike.tsv b/tests_data/example_bedlike.tsv index 28891f9..fd3a664 100644 --- a/tests_data/example_bedlike.tsv +++ b/tests_data/example_bedlike.tsv @@ -1,5 +1,5 @@ -chr1 10 20 1.23 -chr1 14 18 3.10 -chr2 4 5 4.11 -chr2 5 6 0.13 -chr4 9 15 0.34 +chr1 10 20 1.23 1.2 10.1 +chr1 14 18 3.10 3.1 0.1 +chr2 4 5 4.11 2.1 1.1 +chr2 5 6 0.13 0.8 8.1 +chr4 9 15 0.35 0.01 0.0 diff --git a/tests_data/test_case_01.bed b/tests_data/test_case_01.bed new file mode 100644 index 0000000..48e1016 --- /dev/null +++ b/tests_data/test_case_01.bed @@ -0,0 +1,5 @@ +chr1 0 5 a 1.1 +chr1 4 7 b 8.1 +chr1 10 17 c 10.1 +chr2 10 20 d 3.7 +chr2 18 32 e 1.1 diff --git a/tests_data/test_case_03.bed b/tests_data/test_case_03.bed new file mode 100644 index 0000000..d7350b5 --- /dev/null +++ b/tests_data/test_case_03.bed @@ -0,0 +1,5 @@ +chr1 0 5 a 1.1 +chr1 4 7 b 8.1 +chr1 10 17 c 10.1 +chr2 10 20 d 3.7 +chr2 18 32 d 1.1