diff --git a/src/commands.rs b/src/commands.rs index dd326cb..337919e 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,19 +1,19 @@ //! Command functions that implement each of the `granges` subcommands. -use std::path::PathBuf; +use std::{path::PathBuf, io, fs::File}; + +use csv::WriterBuilder; use crate::{ - data::{operations::Operation, DatumType}, + data::{operations::Operation, SerializableDatumType}, io::{ parsers::{Bed5Iterator, GenomicRangesParser}, tsv::BED_TSV, - OutputStream, }, prelude::*, - ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, + ranges::{operations::adjust_range, GenomicRangeRecordEmpty, GenomicRangeRecord}, reporting::{CommandOutput, Report}, test_utilities::{random_granges, random_granges_mock_bed5}, - traits::TsvSerialize, Position, PositionOffset, }; @@ -57,15 +57,18 @@ pub fn granges_adjust( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; - // Setup Output stream -- header is None for now (TODO). - let output_stream = output.map_or(OutputStream::new_stdout(None), |file| { - OutputStream::new(file, None) - }); - let mut writer = output_stream.writer()?; + let writer_boxed: Box = match output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); // For reporting stuff to the user. let mut report = Report::new(); - let mut skipped_ranges = 0; if !sort { @@ -84,16 +87,16 @@ pub fn granges_adjust( let possibly_adjusted_range = adjust_range(range, -both, both, length); if let Some(range_adjusted) = possibly_adjusted_range { - writeln!(writer, "{}", &range_adjusted.to_tsv(&BED_TSV))?; + writer.serialize(range_adjusted)?; } else { skipped_ranges += 1; } if skipped_ranges > 0 { report.add_issue(format!( - "{} ranges were removed because their widths after adjustment were ≤ 0", - skipped_ranges - )) + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) } } } else { @@ -158,7 +161,7 @@ pub fn granges_filter( right_path: &PathBuf, output: Option<&PathBuf>, skip_missing: bool, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); @@ -197,7 +200,7 @@ pub fn granges_filter( right_gr = GRanges::from_iter( right.try_unwrap_data().retain_seqnames(&seqnames), &genome, - )?; + )?; } else { left_gr = GRangesEmpty::from_iter(left, &genome)?; right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; @@ -240,7 +243,7 @@ pub fn granges_filter( right_gr = GRanges::from_iter( right.try_unwrap_data().retain_seqnames(&seqnames), &genome, - )?; + )?; } else { left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; @@ -289,7 +292,7 @@ pub fn granges_flank( output: Option<&PathBuf>, skip_missing: bool, mode: ProcessingMode, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; @@ -326,11 +329,15 @@ pub fn granges_flank( } }, ProcessingMode::Streaming => { - // Setup Output stream -- header is None for now (TODO). 
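// Editorial sketch: a minimal, self-contained restatement of the output pattern this
// diff adopts in `granges_adjust` and `granges_flank` -- a `Box<dyn io::Write>` over
// either a file or stdout, wrapped in a headerless, tab-delimited `csv::Writer`.
// The `BedRecord` struct and `write_bed3` function are illustrative only; the real
// code serializes `GenomicRangeRecord` / `GenomicRangeRecordEmpty`.
use std::{fs::File, io, path::PathBuf};

use csv::WriterBuilder;
use serde::Serialize;

#[derive(Serialize)]
struct BedRecord {
    seqname: String,
    start: u64,
    end: u64,
}

fn write_bed3(
    records: &[BedRecord],
    output: Option<&PathBuf>,
) -> Result<(), Box<dyn std::error::Error>> {
    // Either a file or stdout, behind one trait object.
    let writer_boxed: Box<dyn io::Write> = match output {
        Some(path) => Box::new(File::create(path)?),
        None => Box::new(io::stdout()),
    };

    // Tab-delimited, no header row -- the same settings used throughout the patch.
    let mut writer = WriterBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_writer(writer_boxed);

    for record in records {
        writer.serialize(record)?;
    }
    writer.flush()?;
    Ok(())
}
// e.g. write_bed3(&records, None) streams BED3 lines to stdout.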
- let output_stream = output.map_or(OutputStream::new_stdout(None), |file| { - OutputStream::new(file, None) - }); - let mut writer = output_stream.writer()?; + let writer_boxed: Box = match output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); match ranges_iter { // FIXME: code redundancy. But too early now to design traits, etc. @@ -346,7 +353,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; + writer.serialize(flanking_range)?; } } } else { @@ -358,9 +365,9 @@ pub fn granges_flank( .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; let flanking_ranges = range - .flanking_ranges::(left, right, length); + .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; + writer.serialize(flanking_range)?; } } } @@ -380,7 +387,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; + writer.serialize(flanking_range)?; } } } else { @@ -392,9 +399,9 @@ pub fn granges_flank( .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; let flanking_ranges = range - .flanking_ranges::(left, right, length); + .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; + writer.serialize(flanking_range)?; } } } @@ -417,7 +424,7 @@ pub fn granges_map( operations: Vec, output: Option<&PathBuf>, skip_missing: bool, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); let report = Report::new(); @@ -470,8 +477,10 @@ pub fn granges_map( // Run all operations on the scores. operations .iter() - .map(|operation| operation.run(&mut overlap_scores)) - .collect::>() + .map(|operation| { + operation.run(&mut overlap_scores).into_serializable(&BED_TSV) + }) + .collect::>() })?; result_gr.write_to_tsv(output, &BED_TSV)?; @@ -486,7 +495,7 @@ pub fn granges_windows( step: Option, chop: bool, output: Option>, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?; let report = Report::new(); @@ -500,7 +509,7 @@ pub fn granges_random_bed( output: Option>, sort: bool, bed5: bool, -) -> Result, GRangesError> { + ) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/data/mod.rs b/src/data/mod.rs index 15bbd0f..3f6566d 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -1,7 +1,12 @@ //! Data container implementations. //! -use crate::traits::{DataContainer, IntoDatumType}; +use crate::{ + io::TsvConfig, + traits::{DataContainer, IntoDatumType}, +}; +use serde::ser::Serializer; +use serde::Serialize; #[cfg(feature = "ndarray")] pub mod ndarray; @@ -12,9 +17,9 @@ impl DataContainer for Vec {} impl DataContainer for () {} /// These are core supported data types stored in an `enum`, to -/// unify the types that come out of standard operations like -/// `select()`. 
-#[derive(Debug, Clone)] +/// unify the types that come out of standard operations of +/// heterogeneous output types. +#[derive(Debug, Clone, Serialize)] pub enum DatumType { Float32(f32), Float64(f64), @@ -26,6 +31,39 @@ pub enum DatumType { NoValue, } +impl DatumType { + pub fn into_serializable(self, config: &TsvConfig) -> SerializableDatumType { + SerializableDatumType { + datum: self, + config, + } + } +} + +#[derive(Debug, Clone)] +pub struct SerializableDatumType<'a> { + pub datum: DatumType, + pub config: &'a TsvConfig, +} + +impl<'a> Serialize for SerializableDatumType<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match &self.datum { + DatumType::NoValue => serializer.serialize_str(&self.config.no_value_string), + DatumType::Float32(value) => serializer.serialize_str(&value.to_string()), + DatumType::Float64(value) => serializer.serialize_str(&value.to_string()), + DatumType::String(value) => serializer.serialize_str(value), + DatumType::Integer32(value) => serializer.serialize_str(&value.to_string()), + DatumType::Integer64(value) => serializer.serialize_str(&value.to_string()), + DatumType::Unsigned32(value) => serializer.serialize_str(&value.to_string()), + DatumType::Unsigned64(value) => serializer.serialize_str(&value.to_string()), + } + } +} + impl IntoDatumType for f64 { fn into_data_type(self) -> DatumType { DatumType::Float64(self) diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs index 97799ce..1d52436 100644 --- a/src/data/ndarray.rs +++ b/src/data/ndarray.rs @@ -2,9 +2,8 @@ use crate::error::GRangesError; use crate::granges::GRanges; -use crate::io::TsvConfig; -use crate::traits::{DataContainer, IndexedDataContainer, RangeContainer, TsvSerialize}; -use ndarray::{Array1, Array2, ArrayBase, ArrayView1, Dim, OwnedRepr}; +use crate::traits::{DataContainer, IndexedDataContainer, RangeContainer}; +use ndarray::{Array1, Array2, ArrayView1}; impl DataContainer for Array1 {} impl DataContainer for Array2 {} @@ -210,15 +209,6 @@ where } } -impl TsvSerialize for ArrayBase, Dim<[usize; 1]>> { - fn to_tsv(&self, config: &TsvConfig) -> String { - self.iter() - .map(|x| x.to_tsv(config)) - .collect::>() - .join("\t") - } -} - #[cfg(test)] mod tests { use crate::test_utilities::granges_test_case_01; diff --git a/src/data/operations.rs b/src/data/operations.rs index a6ca8b5..05245a2 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -34,11 +34,19 @@ pub fn median(numbers: &mut [F]) -> Option { /// The (subset of) standard `bedtools map` operations. #[derive(Clone, Debug, ValueEnum)] pub enum Operation { + /// Calculate the sum of all values (a set of zero elements has sum 0.0). Sum, + /// Calculate the sum of all values, but set of zero elements is a missing value, not 0.0. + SumNotEmpty, + /// Calculate the minimum of values. Min, + /// Calculate the maximum of values. Max, + /// Calculate the mean of values. Mean, + /// Calculate the median of values. Median, + /// Concatenate all values into a string separated by commas. Collapse, } @@ -53,6 +61,13 @@ impl Operation { let sum: T = data.iter().copied().sum(); sum.into_data_type() } + Operation::SumNotEmpty => { + if data.is_empty() { + return DatumType::NoValue; + } + let sum: T = data.iter().copied().sum(); + sum.into_data_type() + } Operation::Min => { let min = data .iter() diff --git a/src/granges.rs b/src/granges.rs index 394eac8..3c74a36 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -37,14 +37,16 @@ //! 
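// Editorial sketch: a condensed restatement of the `SerializableDatumType` idea above.
// serde's `Serialize` cannot thread a runtime config (e.g. the missing-value string)
// through serialization, so the value is paired with a borrowed config in a wrapper
// and that wrapper implements `Serialize`. The names `TsvConfig`, `Datum`, and
// `SerializableDatum` here are local stand-ins, not the crate's API.
use serde::ser::Serializer;
use serde::Serialize;

struct TsvConfig {
    no_value_string: String,
}

#[derive(Clone, Debug)]
enum Datum {
    Float64(f64),
    NoValue,
}

struct SerializableDatum<'a> {
    datum: Datum,
    config: &'a TsvConfig,
}

impl<'a> Serialize for SerializableDatum<'a> {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        match &self.datum {
            Datum::Float64(value) => serializer.serialize_str(&value.to_string()),
            // Missing values come out as the configured placeholder, e.g. ".".
            Datum::NoValue => serializer.serialize_str(&self.config.no_value_string),
        }
    }
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = TsvConfig { no_value_string: ".".to_string() };
    let row = vec![
        SerializableDatum { datum: Datum::Float64(0.25), config: &config },
        SerializableDatum { datum: Datum::NoValue, config: &config },
    ];
    let mut writer = csv::WriterBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_writer(vec![]);
    writer.serialize(row)?;
    writer.flush()?;
    assert_eq!(String::from_utf8(writer.into_inner()?)?, "0.25\t.\n");
    Ok(())
}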
[`BedlikeIterator`]: crate::io::parsers::BedlikeIterator //! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees -use std::{collections::HashSet, path::PathBuf}; +use std::{collections::HashSet, fs::File, io, path::PathBuf}; +use csv::WriterBuilder; use genomap::GenomeMap; use indexmap::IndexMap; +use serde::Serialize; use crate::{ ensure_eq, - io::{tsv::TsvConfig, OutputStream}, + io::tsv::TsvConfig, iterators::GRangesIterator, join::{ CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, @@ -55,12 +57,12 @@ use crate::{ ranges::{ coitrees::{COITrees, COITreesEmpty, COITreesIndexed}, vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, - GenomicRangeEmptyRecord, GenomicRangeRecord, RangeEmpty, RangeIndexed, + GenomicRangeRecord, GenomicRangeRecordEmpty, RangeEmpty, RangeIndexed, }, traits::{ AdjustableGenericRange, AsGRangesRef, GenericRange, GenericRangeOperations, GenomicRangesTsvSerialize, IndexedDataContainer, IterableRangeContainer, LeftOverlaps, - RangeContainer, TsvSerialize, + RangeContainer, }, Position, PositionOffset, }; @@ -215,8 +217,7 @@ impl<'a, C, T> AsGRangesRef<'a, C, T> for GRanges { impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges where T: IndexedDataContainer + 'a, - T: TsvSerialize, - ::Item<'a>: TsvSerialize, + ::Item<'a>: Serialize, { /// Write this [`GRanges`] object to a TSV file to `output`, using the [`TsvConfig`] /// specified with `config`. @@ -230,18 +231,25 @@ where output: Option>, config: &TsvConfig, ) -> Result<(), GRangesError> { - // output stream -- header is None for now (TODO) - let output = output.map_or(OutputStream::new_stdout(None), |file| { - OutputStream::new(file, None) - }); - let mut writer = output.writer()?; - - let data_ref = self.data.as_ref().ok_or(GRangesError::NoDataContainer)?; - let seqnames = self.seqnames(); + let writer_boxed: Box = match output { + Some(path) => Box::new(File::create(path.into())?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(config.headers) + .from_writer(writer_boxed); + for range in self.iter_ranges() { - let record = range.to_record(&seqnames, data_ref); - writeln!(writer, "{}", record.to_tsv(config))?; + let record = range.to_record( + &self.seqnames(), + self.data.as_ref().ok_or(GRangesError::NoDataContainer)?, + ); + writer.serialize(record)?; } + + writer.flush()?; Ok(()) } } @@ -259,18 +267,22 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges output: Option>, config: &TsvConfig, ) -> Result<(), GRangesError> { - // TODO gzip output handling - // output stream -- header is None for now (TODO) - let output = output.map_or(OutputStream::new_stdout(None), |file| { - OutputStream::new(file, None) - }); - let mut writer = output.writer()?; - - let seqnames = self.seqnames(); - for range in self.0.iter_ranges() { - let record = range.to_record_empty::<()>(&seqnames); - writeln!(writer, "{}", record.to_tsv(config))?; + let writer_boxed: Box = match output { + Some(path) => Box::new(File::create(path.into())?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(config.headers) + .from_writer(writer_boxed); + + for range in self.iter_ranges() { + let record = range.to_record_empty::<()>(&self.seqnames()); + writer.serialize(record)?; } + + writer.flush()?; Ok(()) } } @@ -1182,7 +1194,7 @@ impl GRangesEmpty { seqlens: &IndexMap, ) -> Result, GRangesError> where - I: Iterator>, + I: 
Iterator>, { let mut gr = GRangesEmpty::new_vec(seqlens); for possible_entry in iter { diff --git a/src/io/file.rs b/src/io/file.rs index f65c2ea..2059dd6 100644 --- a/src/io/file.rs +++ b/src/io/file.rs @@ -134,10 +134,11 @@ impl InputStream { pub fn detect_columns(&mut self, delim: &str) -> Result { let mut skipped_lines = 0; let mut buf_reader = self.reader()?; + let mut line = String::new(); while skipped_lines < self.skip_lines { + buf_reader.read_line(&mut line)?; skipped_lines += 1; } - let mut line = String::new(); buf_reader.read_line(&mut line)?; Ok(line.split(delim).count()) } diff --git a/src/io/parsers/bed.rs b/src/io/parsers/bed.rs index da41900..d1811e3 100644 --- a/src/io/parsers/bed.rs +++ b/src/io/parsers/bed.rs @@ -17,26 +17,27 @@ //! use serde::de::Error as DeError; -use serde::{Deserialize, Deserializer}; +use serde::{Deserialize, Deserializer, Serialize}; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use crate::error::GRangesError; -use crate::io::tsv::TsvConfig; use crate::io::InputStream; -use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; -use crate::traits::TsvSerialize; +use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; use crate::Position; use super::parse_column; use super::tsv::TsvRecordIterator; +// for BedlikeIterator only +pub const PARSE_CAPACITY: usize = 512; + /// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED /// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to /// be used for missing strands, but in practice it can be found to represent /// missing scores, etc too. -pub fn deserialize_bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> +pub fn bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, T: Deserialize<'de> + FromStr, @@ -53,11 +54,11 @@ where /// * `score`: a score. // TODO/RENAME: maybe since this goes against spec it should // be called Bed5AdditionPermissive? -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] #[serde(deny_unknown_fields)] pub struct Bed5Addition { pub name: String, - #[serde(deserialize_with = "deserialize_bed_missing")] + #[serde(deserialize_with = "bed_missing")] pub score: Option, } @@ -94,7 +95,7 @@ impl Iterator for Bed5Iterator { /// An iterator over BED3 entries (which just contain ranges no data). #[derive(Debug)] pub struct Bed3Iterator { - iter: TsvRecordIterator, + iter: TsvRecordIterator, } impl Bed3Iterator { @@ -106,7 +107,7 @@ impl Bed3Iterator { } impl Iterator for Bed3Iterator { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.iter.next() } @@ -119,17 +120,6 @@ pub enum Strand { Reverse, } -impl TsvSerialize for Option { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - match self { - Some(Strand::Forward) => "+".to_string(), - Some(Strand::Reverse) => "-".to_string(), - None => config.no_value_string.to_string(), - } - } -} - /// Deserializes some value of type `t` with some possible missing /// character `missing_chars` into [`Option`]. 
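// Editorial sketch: a standalone restatement of the `detect_columns` fix above. The old
// loop incremented `skipped_lines` without reading, so the skipped lines were never
// consumed from the reader. Note that `read_line` appends rather than overwrites, so
// this sketch also clears the buffer between reads; otherwise the final column count
// runs across the skipped lines as well. The helper is illustrative, not the crate's
// `InputStream` method.
use std::io::BufRead;

fn detect_columns<R: BufRead>(
    mut reader: R,
    skip_lines: usize,
    delim: &str,
) -> std::io::Result<usize> {
    let mut line = String::new();
    for _ in 0..skip_lines {
        line.clear();
        // Consume and discard one skipped line.
        reader.read_line(&mut line)?;
    }
    line.clear();
    reader.read_line(&mut line)?;
    Ok(line.trim_end().split(delim).count())
}

#[cfg(test)]
mod detect_columns_tests {
    use super::*;

    #[test]
    fn skips_then_counts() {
        let data = "# a header line\nchr1\t10\t20\n";
        assert_eq!(detect_columns(data.as_bytes(), 1, "\t").unwrap(), 3);
    }
}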
pub fn deserialize_option_generic<'de, D, T>( @@ -161,32 +151,6 @@ where // } //} -impl TsvSerialize for &Bed5Addition { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!( - "{}\t{}", - self.name, - self.score - .as_ref() - .map_or(config.no_value_string.clone(), |x| x.to_string()) - ) - } -} - -impl TsvSerialize for Bed5Addition { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!( - "{}\t{}", - self.name, - self.score - .as_ref() - .map_or(config.no_value_string.clone(), |x| x.to_string()) - ) - } -} - ///// The additional three BED6 columns. //#[derive(Clone, Debug)] //pub struct Bed6Addition { @@ -201,6 +165,7 @@ impl TsvSerialize for Bed5Addition { /// string columns to parse. pub struct BedlikeIterator { reader: BufReader>, + line_buffer: String, } impl std::fmt::Debug for BedlikeIterator { @@ -214,10 +179,15 @@ impl BedlikeIterator { /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), /// and end (0-indeed and exclusive) positions. pub fn new(filepath: impl Into) -> Result { - let mut input_file = InputStream::new(filepath); - let _has_metadata = input_file.collect_metadata("#", None); - let reader = input_file.continue_reading()?; - Ok(Self { reader }) + let input_file = InputStream::new(filepath); + // let _has_metadata = input_file.collect_metadata("#", None); + // let reader = input_file.continue_reading()?; + let reader = input_file.reader()?; + let line_buffer = String::with_capacity(PARSE_CAPACITY); + Ok(Self { + reader, + line_buffer, + }) } } @@ -225,14 +195,21 @@ impl Iterator for BedlikeIterator { type Item = Result>, GRangesError>; fn next(&mut self) -> Option { - let mut line = String::new(); - match self.reader.read_line(&mut line) { - Ok(0) => None, - Ok(_) => { - let line = line.trim_end(); - Some((parse_bed_lazy)(line)) + self.line_buffer.clear(); + + loop { + self.line_buffer.clear(); + match self.reader.read_line(&mut self.line_buffer) { + Ok(0) => return None, + Ok(_) => { + if !self.line_buffer.starts_with('#') { + let line = self.line_buffer.trim_end(); + return Some(parse_bed_lazy(line)); + } + // skip the metadata/comment character + } + Err(e) => return Some(Err(GRangesError::IOError(e))), } - Err(e) => Some(Err(GRangesError::IOError(e))), } } } diff --git a/src/io/parsers/mod.rs b/src/io/parsers/mod.rs index 28ab3a6..681620e 100644 --- a/src/io/parsers/mod.rs +++ b/src/io/parsers/mod.rs @@ -73,7 +73,7 @@ use std::path::PathBuf; use crate::error::GRangesError; use crate::io::file::InputStream; -use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; +use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; use crate::Position; @@ -330,11 +330,11 @@ where } /// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. -impl Iterator for FilteredRanges +impl Iterator for FilteredRanges where - I: Iterator>, + I: Iterator>, { - type Item = Result; + type Item = Result; /// Get the next filtered entry, prioritizing exclude over retain. 
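// Editorial sketch: the missing-value deserializer pattern behind `bed_missing` and
// `deserialize_option_generic` above -- a "." field becomes `None`, anything else is
// parsed into `T`. This standalone version hard-codes "." and is illustrative; it is
// not the crate's exact signature.
use std::str::FromStr;

use serde::de::Error as DeError;
use serde::{Deserialize, Deserializer};

fn missing_as_none<'de, D, T>(deserializer: D) -> Result<Option<T>, D::Error>
where
    D: Deserializer<'de>,
    T: FromStr,
    <T as FromStr>::Err: std::fmt::Display,
{
    let raw = String::deserialize(deserializer)?;
    if raw == "." {
        Ok(None)
    } else {
        raw.parse::<T>().map(Some).map_err(DeError::custom)
    }
}

// Used exactly like the `score` field of `Bed5Addition`:
#[derive(Deserialize)]
#[allow(dead_code)]
struct ScoredName {
    name: String,
    #[serde(deserialize_with = "missing_as_none")]
    score: Option<f64>,
}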
fn next(&mut self) -> Option { @@ -391,14 +391,14 @@ impl GeneralRangeRecordIterator>> for BedlikeI } } -impl GeneralRangeRecordIterator for Bed3Iterator { - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { +impl GeneralRangeRecordIterator for Bed3Iterator { + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { FilteredRanges::new(self, Some(&seqnames.to_vec()), None) } fn exclude_seqnames( self, seqnames: &[String], - ) -> FilteredRanges { + ) -> FilteredRanges { FilteredRanges::new(self, None, Some(&seqnames.to_vec())) } } diff --git a/src/io/parsers/tsv.rs b/src/io/parsers/tsv.rs index 558fd0b..7c89864 100644 --- a/src/io/parsers/tsv.rs +++ b/src/io/parsers/tsv.rs @@ -59,6 +59,13 @@ impl TsvRecordIterator where for<'de> T: Deserialize<'de>, { + /// Create a new TSV reader. By default, this will skip lines that being with + /// `'#'`, since a pseudo-standard is that these indicate metadata, or column + /// headers. + /// + /// # Stability + /// Future versions may parse comment headers or make this an option. + /// E.g. for VCF, it would need to be parsed. pub fn new(filepath: impl Into) -> Result { let filepath = filepath.into(); @@ -73,6 +80,7 @@ where let reader = ReaderBuilder::new() .delimiter(b'\t') .has_headers(false) + .comment(Some(b'#')) .from_reader(stream); let inner = reader.into_deserialize(); diff --git a/src/io/tsv.rs b/src/io/tsv.rs index 41ef952..f287f2f 100644 --- a/src/io/tsv.rs +++ b/src/io/tsv.rs @@ -1,122 +1,20 @@ //! TSV Serializing helpers, functionality, etc. -use crate::{data::DatumType, traits::TsvSerialize}; use lazy_static::lazy_static; lazy_static! { /// The standard BED format TSV configuration. pub static ref BED_TSV: TsvConfig = TsvConfig { no_value_string: ".".to_string(), + headers: false, }; } /// This is an extensible type to handle common /// TSV output configurations, e.g. what to print /// for `None` or [`DatumType::NoValue`]. 
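// Editorial sketch: the reader configuration `TsvRecordIterator::new` now uses --
// tab-delimited, headerless, with '#'-prefixed lines treated as comments so metadata
// and pseudo-header lines are skipped during deserialization. `Bed3` is an
// illustrative record type, not the crate's.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Bed3 {
    seqname: String,
    start: u64,
    end: u64,
}

fn read_bed3(data: &str) -> Result<Vec<Bed3>, csv::Error> {
    let reader = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .comment(Some(b'#'))
        .from_reader(data.as_bytes());
    reader.into_deserialize().collect()
}
// e.g. read_bed3("# a comment\nchr1\t10\t20\n") yields a single `Bed3` record.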
+#[derive(Debug, Clone)] pub struct TsvConfig { pub no_value_string: String, -} - -impl TsvSerialize for &String { - fn to_tsv(&self, _config: &TsvConfig) -> String { - self.to_string() - } -} - -impl TsvSerialize for String { - fn to_tsv(&self, _config: &TsvConfig) -> String { - self.to_string() - } -} - -impl TsvSerialize for Option { - fn to_tsv(&self, config: &TsvConfig) -> String { - self.as_ref() - .map_or("".to_string(), |x| format!("\t{}", x.to_tsv(config))) - } -} - -impl TsvSerialize for Vec { - fn to_tsv(&self, config: &TsvConfig) -> String { - self.iter() - .map(|x| x.to_tsv(config)) - .collect::>() - .join("\t") - } -} - -impl TsvSerialize for &Vec { - fn to_tsv(&self, config: &TsvConfig) -> String { - self.iter() - .map(|x| x.to_tsv(config)) - .collect::>() - .join("\t") - } -} - -impl TsvSerialize for &f64 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - // TODO precision from config - format!("{}", self).to_string() - } -} - -impl TsvSerialize for f64 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - // TODO precision from config - format!("{}", self).to_string() - } -} - -impl TsvSerialize for &f32 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - // TODO precision from config - format!("{}", self).to_string() - } -} - -impl TsvSerialize for f32 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - // TODO precision from config - format!("{}", self).to_string() - } -} - -impl TsvSerialize for &i64 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - format!("{}", self).to_string() - } -} - -impl TsvSerialize for i64 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - format!("{}", self).to_string() - } -} - -impl TsvSerialize for &i32 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - format!("{}", self).to_string() - } -} - -impl TsvSerialize for i32 { - fn to_tsv(&self, _config: &TsvConfig) -> String { - format!("{}", self).to_string() - } -} - -impl TsvSerialize for DatumType { - fn to_tsv(&self, config: &TsvConfig) -> String { - match self { - DatumType::Float32(val) => val.to_string(), - DatumType::Float64(val) => val.to_string(), - DatumType::String(val) => val.clone(), - DatumType::Integer32(val) => val.to_string(), - DatumType::Integer64(val) => val.to_string(), - DatumType::Unsigned32(val) => val.to_string(), - DatumType::Unsigned64(val) => val.to_string(), - DatumType::NoValue => config.no_value_string.clone(), - } - } + pub headers: bool, } diff --git a/src/lib.rs b/src/lib.rs index 18710f7..a50dbe4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -419,7 +419,7 @@ pub mod prelude { AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, IntoDatumType, IntoIterableRangesContainer, IterableRangeContainer, JoinDataOperations, - LeftOverlaps, TsvSerialize, + LeftOverlaps, }; pub use crate::seqlens; diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 5291922..4719b46 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -2,16 +2,12 @@ //! //! 
-use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::ops::Range; use crate::{ error::GRangesError, - io::tsv::TsvConfig, - traits::{ - AdjustableGenericRange, GenericRange, GenericRangeOperations, IndexedDataContainer, - TsvSerialize, - }, + traits::{AdjustableGenericRange, GenericRange, GenericRangeOperations, IndexedDataContainer}, Position, }; @@ -19,7 +15,7 @@ pub mod coitrees; pub mod operations; pub mod vec; -#[derive(Clone, Debug, Default, PartialEq)] +#[derive(Clone, Debug, Default, PartialEq, Deserialize, Serialize)] pub struct RangeEmpty { pub start: Position, pub end: Position, @@ -171,7 +167,7 @@ impl AdjustableGenericRange for RangeIndexed { /// /// This is used as a type for holding a range with associated data directly /// from a parser. -#[derive(Debug, Clone, PartialEq, Deserialize)] +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] pub struct GenomicRangeRecord { pub seqname: String, pub start: Position, @@ -251,62 +247,15 @@ impl GenericRangeOperations for GenomicRangeRecord { } } -impl TsvSerialize for GenomicRangeRecord<()> { - #[allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!("{}\t{}\t{}", self.seqname, self.start, self.end) - } -} - -// impl TsvSerialize for GenomicRangeRecord> { -// fn to_tsv(&self) -> String { -// match &self.data { -// None => { -// format!("{}\t{}\t{}", self.seqname, self.start, self.end,) -// } -// Some(data) => { -// format!( -// "{}\t{}\t{}\t{}", -// self.seqname, -// self.start, -// self.end, -// data.to_tsv() -// ) -// } -// } -// } -// } - -impl TsvSerialize for GenomicRangeRecord { - /// Serialize this [`GenomicRangeRecord`] to TSV, using the recursive [`TsvSerialize`] - /// trait method. - /// - /// Note that this does *not* append the newline — it is up to - /// the `U.to_tsv()` trait method implementation. This is design is - /// intentional, since one common use case is when `U` is an [`Option`], - /// which is the type combination commonly used in lazy parsing. In this case, - /// when the data is `None`, that indicates no more columns. - #[allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!( - "{}\t{}\t{}\t{}", - self.seqname, - self.start, - self.end, - self.data.to_tsv(config) - ) - } -} - /// Represents a genomic range entry without data, e.g. from a BED3 parser. -#[derive(Debug, Clone, PartialEq, Deserialize)] -pub struct GenomicRangeEmptyRecord { +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct GenomicRangeRecordEmpty { pub seqname: String, pub start: Position, pub end: Position, } -impl GenomicRangeEmptyRecord { +impl GenomicRangeRecordEmpty { pub fn new(seqname: String, start: Position, end: Position) -> Self { assert!(end > start); Self { @@ -317,7 +266,7 @@ impl GenomicRangeEmptyRecord { } } -impl GenericRange for GenomicRangeEmptyRecord { +impl GenericRange for GenomicRangeRecordEmpty { fn start(&self) -> Position { self.start } @@ -329,7 +278,7 @@ impl GenericRange for GenomicRangeEmptyRecord { } } -impl AdjustableGenericRange for GenomicRangeEmptyRecord { +impl AdjustableGenericRange for GenomicRangeRecordEmpty { fn set_start(&mut self, start: Position) { self.start = start } @@ -338,7 +287,7 @@ impl AdjustableGenericRange for GenomicRangeEmptyRecord { } } -impl GenericRangeOperations for GenomicRangeEmptyRecord { +impl GenericRangeOperations for GenomicRangeRecordEmpty { /// Create flanking regions for this [`GenomicRangeEmptyRecord`] range. 
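// Editorial sketch: the hand-written `TsvSerialize` impls removed above are replaced
// by serde derives on the record structs; with the tab-delimited, headerless writer
// the derived output keeps the same `seqname\tstart\tend` layout. A small round trip,
// using a local struct that mirrors `GenomicRangeRecordEmpty`:
use serde::{Deserialize, Serialize};

type Position = u32;

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
struct RangeRecordEmpty {
    seqname: String,
    start: Position,
    end: Position,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let record = RangeRecordEmpty { seqname: "chr1".into(), start: 10, end: 20 };

    // Serialize with the same settings the patch uses for BED output.
    let mut writer = csv::WriterBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_writer(vec![]);
    writer.serialize(&record)?;
    let tsv = String::from_utf8(writer.into_inner()?)?;
    assert_eq!(tsv, "chr1\t10\t20\n");

    // Read it back with the matching reader configuration.
    let mut reader = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_reader(tsv.as_bytes());
    let parsed: RangeRecordEmpty = reader.deserialize().next().unwrap()?;
    assert_eq!(parsed, record);
    Ok(())
}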
fn flanking_ranges( &self, @@ -352,7 +301,7 @@ impl GenericRangeOperations for GenomicRangeEmptyRecord { let flank_end = std::cmp::min(self.start, seqlen); if flank_end > flank_start { let left_flank_region = - GenomicRangeEmptyRecord::new(self.seqname.clone(), flank_start, flank_end); + GenomicRangeRecordEmpty::new(self.seqname.clone(), flank_start, flank_end); flanking.push(left_flank_region); } } @@ -361,7 +310,7 @@ impl GenericRangeOperations for GenomicRangeEmptyRecord { let flank_end = std::cmp::min(self.end + right, seqlen); if flank_end > flank_start { let right_flank_region = - GenomicRangeEmptyRecord::new(self.seqname.clone(), flank_start, flank_end); + GenomicRangeRecordEmpty::new(self.seqname.clone(), flank_start, flank_end); flanking.push(right_flank_region); } } @@ -369,13 +318,6 @@ impl GenericRangeOperations for GenomicRangeEmptyRecord { } } -impl TsvSerialize for GenomicRangeEmptyRecord { - #[allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!("{}\t{}\t{}", self.seqname, self.start, self.end) - } -} - /// Represents a range entry, with indices to sequence name and possibly data. #[derive(Debug, Clone, PartialEq)] pub struct GenomicRangeIndexedRecord { @@ -406,7 +348,7 @@ impl GenomicRangeIndexedRecord { data: &'a T, ) -> GenomicRangeRecord<::Item<'a>> where - T: IndexedDataContainer + TsvSerialize, + T: IndexedDataContainer, { let data = data.get_value(self.index().unwrap()); @@ -417,12 +359,11 @@ impl GenomicRangeIndexedRecord { data, } } - pub fn to_record_empty(self, seqnames: &[String]) -> GenomicRangeRecord<()> { - GenomicRangeRecord { + pub fn to_record_empty(self, seqnames: &[String]) -> GenomicRangeRecordEmpty { + GenomicRangeRecordEmpty { seqname: seqnames[self.seqname_index].clone(), start: self.start, end: self.end, - data: (), } } } diff --git a/src/traits.rs b/src/traits.rs index 8fffea5..8e87db3 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -345,9 +345,3 @@ pub trait Sequences { Ok(gr) } } - -/// Defines how to serialize something to TSV. -pub trait TsvSerialize { - // Serialize something to a TSV [`String`]. - fn to_tsv(&self, config: &TsvConfig) -> String; -} diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index e685433..1cf1cb4 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -2,30 +2,96 @@ use granges::{ commands::granges_random_bed, - prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile}, + io::{parsers::bed::bed_missing, InputStream}, + prelude::{ + read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator, + }, + ranges::GenomicRangeRecord, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, }; use std::{ fs::File, + io::BufRead, + path::PathBuf, process::{Command, Stdio}, }; +use serde::Deserialize; + +// adjustable test sizes -- useful also for making very small +// so string diffs are easier to compare. +#[cfg(not(feature = "bench-big"))] +const BED_LENGTH: usize = 10; +#[cfg(feature = "bench-big")] +const BED_LENGTH: usize = 1_000_000; + +// helpers + +fn head_file(filepath: impl Into) { + let filepath = filepath.into(); + let file = InputStream::new(&filepath); + let mut reader = file.reader().unwrap(); + for _ in 0..10 { + let mut line = String::new(); + reader.read_line(&mut line).unwrap(); + dbg!(&line); + } +} + +#[macro_export] +macro_rules! 
assert_float_tol { + ($a:expr, $b:expr) => {{ + let tol = 1e-5; + assert!( + ($a - $b).abs() < tol, + "assertion failed: `(left !== right)`\n left: `{:?}`, right: `{:?}`, tolerance: `{:?}`", + $a, + $b, + tol + ); + }}; +} + +#[macro_export] +macro_rules! assert_option_float_tol { + ($a:expr, $b:expr) => {{ + let tol = 1e-5; + match ($a, $b) { + (Some(a_val), Some(b_val)) => { + assert!( + (a_val - b_val).abs() < tol, + "assertion failed: `(left !== right)`\n left: `{:?}`, right: `{:?}`, tolerance: `{:?}`", + a_val, + b_val, + tol + ); + } + (None, None) => (), // Both are None, so do nothing. + _ => panic!( + "assertion failed: one is None and the other is Some\n left: `{:?}`, right: `{:?}`", + $a, $b + ), + } + }}; +} + #[test] fn test_random_bed3file_filetype_detect() { let random_bedfile_path = temp_bedfile().path().to_path_buf(); granges_random_bed( "tests_data/hg38_seqlens.tsv", - 100_000, + BED_LENGTH, Some(&random_bedfile_path), true, false, ) .expect("could not generate random BED file"); + head_file(&random_bedfile_path); match GenomicRangesFile::detect(random_bedfile_path).unwrap() { GenomicRangesFile::Bed3(_) => (), - _ => panic!("could not detect correct filetype"), + _ => panic!(": could not detect correct filetype"), } } @@ -36,7 +102,7 @@ fn test_against_bedtools_slop() { granges_random_bed( "tests_data/hg38_seqlens.tsv", - 100_000, + BED_LENGTH, Some(&random_bedfile_path), true, false, @@ -234,7 +300,7 @@ fn test_against_bedtools_makewindows() { #[test] fn test_against_bedtools_map() { - let num_ranges = 100_000; + let num_ranges = BED_LENGTH; let width = 1_000_000; #[allow(unused_variables)] let step = 10_000; // can uncomment lines below to test this @@ -314,13 +380,22 @@ fn test_against_bedtools_map() { let granges_data = granges_gr.take_data().unwrap(); let granges_data = granges_data.iter().map(|extra_cols| { - let score: Option = extra_cols.as_ref().unwrap().parse().ok(); - score + // explicitly handle missing, then unwrap (rather than ok(), which is less safe) + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: Result = extra_cols.as_ref().unwrap().parse(); + // dbg!(&extra_cols); + Some(score.unwrap()) }); + let bedtools_data = bedtools_gr.take_data().unwrap(); let bedtools_data = bedtools_data.iter().map(|extra_cols| { - let score: Option = extra_cols.as_ref().unwrap().parse().ok(); - score + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); + Some(score) }); assert_eq!(granges_data.len(), bedtools_data.len()); @@ -339,3 +414,118 @@ fn test_against_bedtools_map() { }); } } + +#[test] +fn test_against_bedtools_map_multiple() { + // try multiple operations at once + let num_ranges = BED_LENGTH; + let width = 1_000_000; + #[allow(unused_variables)] + let step = 10_000; // can uncomment lines below to test this + + // make windows + let windows_file = temp_bedfile(); + let granges_windows_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + // .arg("--step") + // .arg(step.to_string()) + .arg("--output") + .arg(windows_file.path()) + .output() + .expect("granges windows failed"); + assert!( + granges_windows_output.status.success(), + "{:?}", + granges_windows_output + ); + + // create the random data BED5 + let bedscores_file = random_bed5file(num_ranges); + + let bedtools_path = temp_bedfile(); + let bedtools_output_file = 
File::create(&bedtools_path).unwrap(); + + // compare map commands + let bedtools_output = Command::new("bedtools") + .arg("map") + .arg("-a") + .arg(windows_file.path()) + .arg("-b") + .arg(&bedscores_file.path()) + .arg("-c") + .arg("5") + .arg("-o") + .arg("min,max,mean,sum,median") + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools map failed"); + + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("map") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(windows_file.path()) + .arg("--right") + .arg(bedscores_file.path()) + .arg("--func") + .arg("min,max,mean,sum-not-empty,median") + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges map failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + + let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); + + #[derive(Deserialize)] + struct Stats { + #[serde(deserialize_with = "bed_missing")] + min: Option, + #[serde(deserialize_with = "bed_missing")] + max: Option, + #[serde(deserialize_with = "bed_missing")] + mean: Option, + #[serde(deserialize_with = "bed_missing")] + sum: Option, + #[serde(deserialize_with = "bed_missing")] + median: Option, + } + + head_file(&bedtools_path.path()); + let bedtools_iter = + TsvRecordIterator::>::new(bedtools_path.path()).expect("HERE"); + let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); + + let granges_iter = TsvRecordIterator::>::new( + granges_output_file.path().to_path_buf(), + ) + .unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); + + let granges_data = granges_gr.take_data().unwrap(); + let bedtools_data = bedtools_gr.take_data().unwrap(); + + granges_data + .iter() + .zip(bedtools_data.iter()) + .for_each(|(gr, bd)| { + assert_option_float_tol!(gr.min, bd.min); + assert_option_float_tol!(gr.max, bd.max); + + // NOTE: this breaks because with bedools sum, + // zero overlapping ranges = '.' (None), not 0.0. + // Hence, our sum above is sum-not-empty + assert_option_float_tol!(gr.sum, bd.sum); + + assert_option_float_tol!(gr.median, bd.median); + assert_option_float_tol!(gr.mean, bd.mean); + }); +} diff --git a/tests_data/example.bed b/tests_data/example.bed index 0f40e10..6dc6758 100644 --- a/tests_data/example.bed +++ b/tests_data/example.bed @@ -1,3 +1,4 @@ +# surprised header to try to break stuff ;-) chr1 10 20 chr1 14 18 chr2 4 5
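// Editorial sketch: why the test above maps granges `sum-not-empty` against bedtools
// `sum`. As the test's own note says, bedtools emits '.' when a window overlaps zero
// ranges, whereas a plain sum of an empty slice is 0.0; `SumNotEmpty` reproduces the
// bedtools behaviour. A minimal illustration of the two semantics:
fn sum(data: &[f64]) -> f64 {
    data.iter().copied().sum()
}

fn sum_not_empty(data: &[f64]) -> Option<f64> {
    if data.is_empty() {
        // Rendered downstream as the configured missing-value string, e.g. ".".
        return None;
    }
    Some(data.iter().copied().sum())
}

fn main() {
    assert_eq!(sum(&[]), 0.0);
    assert_eq!(sum_not_empty(&[]), None);
    assert_eq!(sum_not_empty(&[1.0, 2.5]), Some(3.5));
}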