From a5c3a00699a76fae74fc2377c56c984ee60374e6 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sat, 24 Feb 2024 23:42:48 -0800 Subject: [PATCH] Prototype of bedtools map working with multiple operators. - New `TsvConfig` type for e.g. missing data string, with `BED_TSV` definition. - Fixed issues with `left_overlaps()`. - New `DatumType` enum for data types. - New `IntoDatumType` trait for conversion. - Fix to `median()`. - `Operation.run()` method for operations. - New AI-generated more helpful errors. - `to_tsv()` now with `TsvConfig` - lazy static dep. added. - Better parsing error messages. - New join data iterators (mostly for error checking. --- Cargo.toml | 1 + src/commands.rs | 90 +++++++++++-------- src/data/mod.rs | 64 +++++++++++++- src/data/operations.rs | 94 +++++++++++--------- src/error.rs | 140 ++++++++++++++++++++++++------ src/granges.rs | 161 +++++++++++++++++++++------------- src/io/mod.rs | 1 + src/io/parsers.rs | 82 +++++++++++++----- src/io/tsv.rs | 72 ++++++++++++++++ src/join.rs | 167 +++++++++++++++++++----------------- src/lib.rs | 4 +- src/main/mod.rs | 35 +++++--- src/ranges/mod.rs | 14 +-- src/sequences/nucleotide.rs | 10 ++- src/traits.rs | 72 +++++++--------- 15 files changed, 685 insertions(+), 322 deletions(-) create mode 100644 src/io/tsv.rs diff --git a/Cargo.toml b/Cargo.toml index 56ff344..d5ab0b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ polars = { version = "0.37.0", optional = true } bytes = "1.5.0" ndarray-npy = { version = "0.8.1", optional = true } num-traits = "0.2.18" +lazy_static = "1.4.0" [features] # cli = [ "clap" ] # TODO make feature diff --git a/src/commands.rs b/src/commands.rs index 84346fb..2499d76 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -3,8 +3,12 @@ use std::path::PathBuf; use crate::{ - data::operations::Operation, - io::{parsers::{GenomicRangesParser, Bed5Iterator}, OutputStream}, + data::{operations::Operation, DatumType}, + io::{ + parsers::{Bed5Iterator, 
GenomicRangesParser}, + tsv::BED_TSV, + OutputStream, + }, prelude::*, ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, reporting::{CommandOutput, Report}, @@ -80,7 +84,7 @@ pub fn granges_adjust( let possibly_adjusted_range = adjust_range(range, -both, both, length); if let Some(range_adjusted) = possibly_adjusted_range { - writeln!(writer, "{}", &range_adjusted.to_tsv())?; + writeln!(writer, "{}", &range_adjusted.to_tsv(&BED_TSV))?; } else { skipped_ranges += 1; } @@ -101,10 +105,11 @@ pub fn granges_adjust( match ranges_iter { GenomicRangesParser::Bed3(iter) => { let gr = GRangesEmpty::from_iter(iter, &genome)?; - gr.adjust_ranges(-both, both).to_tsv(output)? + gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bed5(iter) => { - unimplemented!() + let gr = GRanges::from_iter(iter, &genome)?; + gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bedlike(iter) => { // Note the call to try_unwrap_data() here: this is because @@ -112,7 +117,7 @@ pub fn granges_adjust( // values means that writing to TSV doesn't have to deal with this (which // always creates headaches). let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?; - gr.adjust_ranges(-both, both).to_tsv(output)? + gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)? 
} GenomicRangesParser::Unsupported => { return Err(GRangesError::UnsupportedGenomicRangesFileFormat) @@ -158,7 +163,7 @@ pub fn granges_filter( let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; // for reporting stuff to the user - let mut report = Report::new(); + let report = Report::new(); match (left_iter, right_iter) { (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => { @@ -176,7 +181,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output)?; + semijoin.to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -198,7 +203,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output)?; + semijoin.to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -218,7 +223,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let semijoin = left_gr.filter_overlaps(&right_gr)?; - semijoin.to_tsv(output)?; + semijoin.to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -241,7 +246,7 @@ pub fn granges_filter( let right_gr = right_gr.into_coitrees()?; let intersection = left_gr.filter_overlaps(&right_gr)?; - intersection.to_tsv(output)?; + intersection.to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -298,7 +303,7 @@ pub fn granges_flank( } else { GRangesEmpty::from_iter(iter, &genome)? }; - gr.flanking_ranges(left, right)?.to_tsv(output)? + gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)? } GenomicRangesParser::Bed5(_iter) => { unimplemented!() @@ -309,7 +314,7 @@ pub fn granges_flank( } else { GRanges::from_iter(iter.try_unwrap_data(), &genome)? }; - gr.flanking_ranges(left, right)?.to_tsv(output)? + gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)? 
} GenomicRangesParser::Unsupported => { return Err(GRangesError::UnsupportedGenomicRangesFileFormat) @@ -336,7 +341,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv())?; + writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; } } } else { @@ -350,7 +355,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv())?; + writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; } } } @@ -370,7 +375,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv())?; + writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; } } } else { @@ -384,7 +389,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writeln!(writer, "{}", &flanking_range.to_tsv())?; + writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?; } } } @@ -398,7 +403,6 @@ pub fn granges_flank( Ok(CommandOutput::new((), report)) } - /// # Developer Notes /// This function is a great way to see GRange's methods in action. pub fn granges_map( @@ -411,7 +415,7 @@ pub fn granges_map( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); - let mut report = Report::new(); + let report = Report::new(); let left_iter = Bed3Iterator::new(left_path)?; let right_iter = Bed5Iterator::new(right_path)?; @@ -427,22 +431,40 @@ pub fn granges_map( right_gr = GRanges::from_iter(right_iter, &genome)?; } - let right_gr = right_gr.into_coitrees()? 
- .map_data(|bed5_cols| { - // extract out just the score - bed5_cols.score - })?; - - let left_join = left_gr.left_overlaps(&right_gr)?; + if left_gr.is_empty() { + return Err(GRangesError::NoRows); + } + if right_gr.is_empty() { + return Err(GRangesError::NoRows); + } - let new_column = left_join.map_over_joins(|join_data| { - join_data. - operations.iter().map(|operation| operation.run(data)).collect() + let right_gr = { + // Convert to interval trees for join. + right_gr + .into_coitrees()? + // Select out the score. + .map_data(|bed5_cols| { + // Extract out just the score. + bed5_cols.score + })? + }; + + // Find the overlaps. + let left_join_gr = left_gr.left_overlaps(&right_gr)?; + + // Process all the overlaps. + let result_gr = left_join_gr.map_over_joins(|join_data| { + // Get the "right data" -- the BED5 scores. + let overlap_scores = join_data.right_data; + + // Run all operations on the scores. + operations + .iter() + .map(|operation| operation.run(overlap_scores.as_ref())) + .collect::>() })?; - - // TODO -- map function - // left_join.to_tsv(output)?; + result_gr.to_tsv(output, &BED_TSV)?; Ok(CommandOutput::new((), report)) } @@ -453,7 +475,7 @@ pub fn granges_random_bed( num: usize, output: Option>, sort: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; @@ -463,7 +485,7 @@ pub fn granges_random_bed( gr = gr.sort(); } - gr.to_tsv(output)?; + gr.to_tsv(output, &BED_TSV)?; let report = Report::new(); Ok(CommandOutput::new((), report)) diff --git a/src/data/mod.rs b/src/data/mod.rs index c3ac1cb..532428a 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -1,10 +1,72 @@ //! Data container implementations. //! -use crate::traits::DataContainer; +use crate::traits::{DataContainer, IntoDatumType}; pub mod operations; pub mod vec; +/// These are core supported data types stored in an `enum`, to +/// unify the types that come out of standard operations like +/// `select()`. 
+#[derive(Debug, Clone)] +pub enum DatumType { + Float32(f32), + Float64(f64), + String(String), + Integer32(i32), + Integer64(i64), + Unsigned32(u32), + Unsigned64(u64), + NoValue, +} + +impl IntoDatumType for f64 { + fn into_data_type(self) -> DatumType { + DatumType::Float64(self) + } +} +impl IntoDatumType for i64 { + fn into_data_type(self) -> DatumType { + DatumType::Integer64(self) + } +} +impl IntoDatumType for String { + fn into_data_type(self) -> DatumType { + DatumType::String(self) + } +} + +impl IntoDatumType for f32 { + fn into_data_type(self) -> DatumType { + DatumType::Float32(self) + } +} + +impl IntoDatumType for i32 { + fn into_data_type(self) -> DatumType { + DatumType::Integer32(self) + } +} + +impl IntoDatumType for u32 { + fn into_data_type(self) -> DatumType { + DatumType::Unsigned32(self) + } +} + +impl IntoDatumType for u64 { + fn into_data_type(self) -> DatumType { + DatumType::Unsigned64(self) + } +} + +// Conversion from field types to `DatumType` +impl From for DatumType { + fn from(item: T) -> Self { + item.into_data_type() + } +} + impl DataContainer for Vec {} impl DataContainer for () {} diff --git a/src/data/operations.rs b/src/data/operations.rs index 7677b33..e03356c 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -1,35 +1,32 @@ //! Implementations of various operations on data. //! +use clap::ValueEnum; use num_traits::{Float, ToPrimitive}; use std::iter::Sum; -use crate::error::GRangesError; +use super::DatumType; +use crate::traits::IntoDatumType; /// Calculate the median. /// /// This will clone and turn `numbers` into a `Vec`. 
-pub fn median(numbers: &[F]) -> F { +pub fn median(numbers: &[F]) -> Option { + if numbers.is_empty() { + return None; + } let mut numbers = numbers.to_vec(); - numbers.sort_unstable(); + numbers.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mid = numbers.len() / 2; if numbers.len() % 2 == 0 { - (numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap() + Some((numbers[mid - 1] + numbers[mid]) / F::from(2.0).unwrap()) } else { - numbers[mid] + Some(numbers[mid]) } } -/// A subset of types that represent operation results types. -pub enum OperationResult -where -T: Float, -{ - Float(T), - String(String), -} - /// The (subset of) standard `bedtools map` operations. +#[derive(Clone, Debug, ValueEnum)] pub enum Operation { Sum, Min, @@ -40,36 +37,49 @@ pub enum Operation { } impl Operation { - /// Do a particular (summarizing) operation on some generic data. - pub fn run(&self, data: &[T]) -> Option> - where - T: Float + Sum + ToPrimitive + Ord + Clone + ToString, - { - match self { - Operation::Sum => { + pub fn run(&self, data: &[T]) -> DatumType + where + T: Float + Sum + ToPrimitive + Clone + ToString, + { + match self { + Operation::Sum => { + let sum: T = data.iter().cloned().sum(); + sum.into_data_type() + } + Operation::Min => { + let min = data + .iter() + .filter(|x| x.is_finite()) + .cloned() + .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater)); + min.map_or(DatumType::NoValue, |x| x.into_data_type()) + } + Operation::Max => { + let max = data + .iter() + .filter(|x| x.is_finite()) + .cloned() + .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less)); + max.map_or(DatumType::NoValue, |x| x.into_data_type()) + } + Operation::Mean => { + if data.is_empty() { + DatumType::NoValue + } else { let sum: T = data.iter().cloned().sum(); - Some(OperationResult::Float(sum)) - } - Operation::Min => data.iter().cloned().min().map(OperationResult::Float), - Operation::Max => data.iter().cloned().max().map(OperationResult::Float), - 
Operation::Mean => { - if data.is_empty() { - None - } else { - let sum: T = data.iter().cloned().sum(); - let mean = sum / T::from(data.len()).unwrap(); - Some(OperationResult::Float(mean)) - } - } - Operation::Median => Some(OperationResult::Float(median(data))), - Operation::Collapse => { - let collapsed = data - .iter() - .map(|num| num.to_string()) - .collect::>() - .join(", "); - Some(OperationResult::String(collapsed)) + let mean = sum / T::from(data.len()).unwrap(); + mean.into_data_type() } } + Operation::Median => median(data).map_or(DatumType::NoValue, |x| x.into_data_type()), + Operation::Collapse => { + let collapsed = data + .iter() + .map(|num| num.to_string()) + .collect::>() + .join(", "); + DatumType::String(collapsed) + } } + } } diff --git a/src/error.rs b/src/error.rs index 4224ba6..4f21944 100644 --- a/src/error.rs +++ b/src/error.rs @@ -8,62 +8,152 @@ use std::{ }; use thiserror::Error; -/// The [`GRangesError`] defines the standard set of errors that should -/// be passed to the user. +///// The [`GRangesError`] defines the standard set of errors that should +///// be passed to the user. +//#[derive(Debug, Error)] +//pub enum GRangesError { +// // IO related errors +// #[error("File reading eror: {0}")] +// IOError(#[from] std::io::Error), +// +// // File parsing related errors +// #[error("Could not detect genomic ranges filetype from extension.")] +// CouldNotDetectRangesFiletype, +// #[error("Integer parsing error: {0}")] +// ParseIntError(#[from] ParseIntError), +// #[error("Float parsing error: {0}")] +// ParseFloatError(#[from] ParseFloatError), +// #[error("Bed-like file has too few columns. 
The first three columns must be sequence name, and start and end positions.\nLine: {0}")] +// BedlikeTooFewColumns(String), +// #[error("File has invalid column type entry: {0}")] +// InvalidColumnType(String), +// #[error("Genome file is invalid: {0}")] +// InvalidGenomeFile(String), +// #[error("Invalid BED string: must be either '+', '-', or '.'")] +// InvalidString, +// +// // BedlikeIterator errors +// #[error("GenomicRangeRecord encountered with None data in try_unwrap_data()")] +// TryUnwrapDataError, +// #[error("GenomicRangeRecord.try_unwrap_data() called on TSV with fewer than 4 columns")] +// TooFewColumns, +// +// // Invalid genomic range errors +// #[error("Range invalid: start ({0}) must be greater than end ({1})")] +// InvalidGenomicRange(Position, Position), +// #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] +// InvalidGenomicRangeForSequence(Position, Position, Position), +// #[error("Sequence name '{0}' is not in the ranges container")] +// MissingSequence(String), +// #[error("Error encountered in genomap::GenomeMap")] +// GenomeMapError(#[from] GenomeMapError), +// +// #[error("Invalid GRanges object: no data container.")] +// NoDataContainer, +// +// // Sequences related errors +// #[error("Sequence name '{0}' was not found.")] +// MissingSequenceName(String), +// +// // FASTA/noodles related errors +// #[error("Error encountered in trying to convert bytes to UTF8 string.")] +// FromUtf8Error(#[from] FromUtf8Error), +// +// // Command line tool related errors +// #[error("Unsupported genomic range format")] +// UnsupportedGenomicRangesFileFormat, +// #[error("Command line argument error: {0}")] +// ArgumentError(#[from] clap::error::Error), +// #[error("No such operation: {0}")] +// NoSuchOperation(String), +//} + +// AICODE/NOTE: these are AI-generated from the above human-written ones. 
#[derive(Debug, Error)] pub enum GRangesError { // IO related errors - #[error("File reading eror: {0}")] + #[error("File reading error: {0}. Please check if the file exists and you have permission to read it.")] IOError(#[from] std::io::Error), // File parsing related errors - #[error("Could not detect genomic ranges filetype from extension.")] + #[error("Could not determine the file type based on its extension. Ensure the file has a standard genomic data extension (.bed, .gff, etc.).")] CouldNotDetectRangesFiletype, - #[error("Integer parsing error: {0}")] + + #[error("Error parsing an integer value: {0}. Check the file for non-integer entries where integers are expected.")] ParseIntError(#[from] ParseIntError), - #[error("Float parsing error: {0}")] + + #[error("Error parsing a floating-point value: {0}. Check the file for non-numeric entries where floating-point numbers are expected.")] ParseFloatError(#[from] ParseFloatError), - #[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")] - BedlikeTooFewColumns(String), - #[error("File has invalid column type entry: {0}")] - InvalidColumnType(String), - #[error("Genome file is invalid: {0}")] + + #[error( + "The provided BED file has fewer columns ({0}) than expected ({1}). Problematic line:\n{2}" + )] + BedTooFewColumns(usize, usize, String), + + #[error("The provided BED3 file has fewer columns ({0}) than expected (3).\nAt least three columns are needed: sequence name, start, and end positions.\nProblematic line:\n{1}")] + Bed3TooFewColumns(usize, String), + + #[error( + "Invalid column type: expected {expected_type} but got '{found_value}' in line: '{line}'." + )] + InvalidColumnType { + expected_type: String, + found_value: String, + line: String, + }, + + #[error("The input file had no rows.")] + NoRows, + + #[error("The genome file is invalid: {0}. 
Please verify the file's format and contents.")] InvalidGenomeFile(String), - #[error("Invalid BED string: must be either '+', '-', or '.'")] + + #[error("Invalid BED format detected. Each entry must be '+', '-', or '.' to represent strand information.")] InvalidString, // BedlikeIterator errors - #[error("GenomicRangeRecord encountered with None data in try_unwrap_data()")] + #[error("Attempted to unwrap genomic range data, but none was present. This operation requires data to be associated with each genomic range.")] TryUnwrapDataError, - #[error("GenomicRangeRecord.try_unwrap_data() called on TSV with fewer than 4 columns")] + + #[error("Attempted to unwrap genomic range data from a TSV with fewer than 4 columns. Ensure your TSV includes the necessary genomic range columns plus additional data.")] TooFewColumns, // Invalid genomic range errors - #[error("Range invalid: start ({0}) must be greater than end ({1})")] + #[error("Invalid genomic range specified: start position ({0}) must be less than or equal to the end position ({1}).")] InvalidGenomicRange(Position, Position), - #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] + + #[error("The specified genomic range [{0}, {1}] is invalid for a sequence of length {2}. Adjust the range to fit within the sequence length.")] InvalidGenomicRangeForSequence(Position, Position, Position), - #[error("Sequence name '{0}' is not in the ranges container")] + + #[error("The sequence name '{0}' is not found within the provided ranges container. Check the sequence names for typos or missing entries.")] MissingSequence(String), - #[error("Error encountered in genomap::GenomeMap")] + + #[error("An error was encountered with the underlying genomap::GenomeMap: {0}")] GenomeMapError(#[from] GenomeMapError), - #[error("Invalid GRanges object: no data container.")] + #[error("The GRanges object is invalid because it lacks an associated data container. 
Ensure data is loaded or associated with the GRanges object before attempting operations that require data.")] NoDataContainer, // Sequences related errors - #[error("Sequence name '{0}' was not found.")] + #[error("The sequence name '{0}' was not found in the dataset. Verify the sequence names and try again.")] MissingSequenceName(String), // FASTA/noodles related errors - #[error("Error encountered in trying to convert bytes to UTF8 string.")] + #[error("An error occurred while converting bytes to a UTF-8 string. This often indicates invalid or corrupted data.")] FromUtf8Error(#[from] FromUtf8Error), // Command line tool related errors - #[error("Unsupported genomic range format")] + #[error("The specified genomic range file format is unsupported. Check the documentation for a list of supported formats.")] UnsupportedGenomicRangesFileFormat, - #[error("Command line argument error: {0}")] + + #[error("There was an error with the provided command line argument: {0}. Review the command line arguments for any mistakes.")] ArgumentError(#[from] clap::error::Error), - #[error("No such operation: {0}")] - NoSuchOperation, + + #[error( + "The requested operation '{0}' does not exist. Check the list of supported operations." + )] + NoSuchOperation(String), + + #[error("No operation was specified. 
See granges map --help.")] + NoOperationSpecified, } diff --git a/src/granges.rs b/src/granges.rs index 3549d72..e4894f1 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -85,7 +85,7 @@ use indexmap::IndexMap; use crate::{ ensure_eq, - io::OutputStream, + io::{tsv::TsvConfig, OutputStream}, iterators::GRangesIterator, join::{ CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, @@ -225,7 +225,11 @@ where ::Item<'a>: TsvSerialize, { /// Write - fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { + fn to_tsv( + &'a self, + output: Option>, + config: &TsvConfig, + ) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) let output = output.map_or(OutputStream::new_stdout(None), |file| { OutputStream::new(file, None) @@ -236,7 +240,7 @@ where let seqnames = self.seqnames(); for range in self.iter_ranges() { let record = range.to_record(&seqnames, data_ref); - writeln!(writer, "{}", record.to_tsv())?; + writeln!(writer, "{}", record.to_tsv(config))?; } Ok(()) } @@ -244,7 +248,11 @@ where impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRangesEmpty { /// Output a BED3 file for for this data-less [`GRanges`]. 
- fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { + fn to_tsv( + &'a self, + output: Option>, + config: &TsvConfig, + ) -> Result<(), GRangesError> { // output stream -- header is None for now (TODO) let output = output.map_or(OutputStream::new_stdout(None), |file| { OutputStream::new(file, None) @@ -254,7 +262,7 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges let seqnames = self.seqnames(); for range in self.0.iter_ranges() { let record = range.to_record_empty::<()>(&seqnames); - writeln!(writer, "{}", record.to_tsv())?; + writeln!(writer, "{}", record.to_tsv(config))?; } Ok(()) } @@ -517,17 +525,20 @@ impl GRanges> { } } - -impl GRanges> -where C: RangeContainer { - /// Consume this [`GRanges>`] object, applying `func` to all elements +impl GRanges> +where + C: RangeContainer, +{ + /// Consume this [`GRanges>`] object, applying `func` to all elements /// in [`Vec`], to return a new [`GRanges>`]. /// - pub fn map_data(mut self, func: F) -> Result>, GRangesError> - where F: Fn(U) -> V { + pub fn map_data(mut self, func: F) -> Result>, GRangesError> + where + F: Fn(U) -> V, + { let left_data = self.take_data()?; let transformed_data = left_data.into_iter().map(func).collect(); - Ok(GRanges { + Ok(GRanges { ranges: self.ranges, data: Some(transformed_data), }) @@ -573,33 +584,34 @@ impl<'a, DL, DR> GRanges> { } } -impl<'a, T> GRanges, T> -where - T: IndexedDataContainer + 'a, - T: TsvSerialize, - ::Item<'a>: TsvSerialize, -{ - /// Write this [`GRanges`] object to a TSV file, using the - /// [`TsvSerialize`] trait methods defiend for the items in `T`. 
- pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { - // output stream -- header is None for now (TODO) - let output = output.map_or(OutputStream::new_stdout(None), |file| { - OutputStream::new(file, None) - }); - let mut writer = output.writer()?; - - let seqnames = self.seqnames(); - let data_ref = self.data.as_ref().ok_or(GRangesError::NoDataContainer)?; - for range in self.iter_ranges() { - let record = range.to_record(&seqnames, data_ref); - writeln!(writer, "{}", record.to_tsv())?; - } - Ok(()) - } -} +//impl<'a, T> GRanges, T> +//where +// T: IndexedDataContainer + 'a, +// T: TsvSerialize, +// ::Item<'a>: TsvSerialize, +//{ +// /// Write this [`GRanges`] object to a TSV file, using the +// /// [`TsvSerialize`] trait methods defined for the items in `T`. +// pub fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError> { +// // output stream -- header is None for now (TODO) +// let output = output.map_or(OutputStream::new_stdout(None), |file| { +// OutputStream::new(file, None) +// }); +// let mut writer = output.writer()?; +// +// let seqnames = self.seqnames(); +// let data_ref = self.data.as_ref().ok_or(GRangesError::NoDataContainer)?; +// for range in self.iter_ranges() { +// let record = range.to_record(&seqnames, data_ref); +// writeln!(writer, "{}", record.to_tsv())?; +// } +// Ok(()) +// } +//} /// [`GRanges::left_overlaps()`] for the left with data, right with data case. -impl<'a, DL: 'a, DR: 'a> LeftOverlaps<'a, GRanges> for GRanges +impl<'a, DL: 'a, DR: 'a> LeftOverlaps<'a, GRanges> + for GRanges where DL: IndexedDataContainer + 'a, DR: IndexedDataContainer + 'a, @@ -609,7 +621,7 @@ where /// Conduct a left overlap join, consuming self and returning a new /// [`GRanges`]. /// - /// The [`JoinData`] container contains the owned left data container and has + /// The [`JoinData`] container contains the owned left data container and has /// a reference to the right data container, as as well as a [`Vec`]. 
/// - /// The [`JoinData`] container contains the left data container and has + /// The [`JoinData`] container contains the left data container and has /// a reference to the right data container, as as well as a [`Vec LeftOverlaps<'a, GRanges> for GRangesEmpty where - C: RangeContainer, + C: IterableRangeContainer, DR: IndexedDataContainer + 'a, { type Output = GRanges, JoinDataLeftEmpty<'a, DR>>; @@ -703,7 +716,7 @@ where /// Conduct a left overlap join, consuming self and returning a new /// [`GRanges`]. /// - /// The [`JoinDataLeftEmpty`] contains no left data, and a reference to the + /// The [`JoinDataLeftEmpty`] contains no left data, and a reference to the /// right data container, as as well as a [`Vec LeftOverlaps<'a, GRangesEmpty> for GRangesEmpty, + right: &'a GRangesEmpty, ) -> Result { - let gr: GRanges> = GRanges::new_vec(&self.0.seqlens()); + let mut gr: GRanges> = + GRanges::new_vec(&self.0.seqlens()); + + for (seqname, left_ranges) in self.0.ranges.iter() { + for left_range in left_ranges.iter_ranges() { + // Left join: every left range gets a JoinData. + let mut join_data = LeftGroupedJoin::new(&left_range); + if let Some(right_ranges) = right.0.ranges.get(seqname) { + right_ranges.query(left_range.start(), left_range.end(), |right_range| { + // NOTE: right_range is a coitrees::IntervalNode. + join_data.add_right(&left_range, right_range); + }); + } + gr.push_range_with_join(seqname, left_range.start(), left_range.end(), join_data)?; + } + } - // Since there's no data on either side, we essentially return an empty structure + // get the join data out, to transform it to a more informative type + let join_data = gr.take_data()?; let data = JoinDataBothEmpty { - joins: Vec::new(), // No joins possible without data + joins: join_data.joins, }; let ranges = gr.ranges; Ok(GRanges { @@ -775,7 +818,7 @@ where /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated, /// etc. 
/// - /// # Arugments + /// # Arugments /// * `func`: a function that takes a [`CombinedJoinData`] (which contains /// the associated data for the left range and overlapping right ranges) /// and summarizes it into a new type `V`. @@ -815,7 +858,7 @@ impl GRanges { /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated, /// etc. /// - /// # Arugments + /// # Arugments /// * `func`: a function that takes a [`CombinedJoinDataLeftEmpty`] (which contains /// the associated data for the left range and overlapping right ranges) /// and summarizes it into a new type `V`. @@ -854,7 +897,7 @@ where /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated, /// etc. /// - /// # Arugments + /// # Arugments /// * `func`: a function that takes a [`CombinedJoinDataRightEmpty`] (which contains /// the associated data for the left range and overlapping right ranges) /// and summarizes it into a new type `V`. @@ -892,7 +935,7 @@ where /// e.g. a median `f64` value, a `String` of all overlap right ranges' values concatenated, /// etc. /// - /// # Arugments + /// # Arugments /// * `func`: a function that takes a [`CombinedJoinDataLeftEmpty`] (which contains /// the associated data for the left range and overlapping right ranges) /// and summarizes it into a new type `V`. 
@@ -918,7 +961,7 @@ where } } -impl<'a, C, T> GRanges +impl GRanges where T: IndexedDataContainer, { @@ -1391,15 +1434,14 @@ mod tests { // get join data let data = joined_results.data.unwrap(); - // TODO fix // check is left join - //assert_eq!(data.len(), windows_len); + assert_eq!(data.len(), windows_len); - //let mut join_iter = data.iter(); - //assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); - //assert_eq!(join_iter.next().unwrap().num_overlaps(), 0); - //assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); - //assert_eq!(join_iter.next().unwrap().num_overlaps(), 1); + let mut join_iter = data.iter(); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 0); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 2); + assert_eq!(join_iter.next().unwrap().num_overlaps(), 1); // rest are empty TODO should check } @@ -1417,6 +1459,7 @@ mod tests { right_gr.push_range("chr1", 23, 24, 2.9).unwrap(); let right_gr = right_gr.into_coitrees().unwrap(); + // TODO let joined_results = windows.left_overlaps(&right_gr).unwrap(); // joined_results.apply_over_joins(); } diff --git a/src/io/mod.rs b/src/io/mod.rs index 43df647..7362132 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -2,6 +2,7 @@ pub mod file; pub mod parsers; +pub mod tsv; pub use file::{InputStream, OutputStream}; pub use parsers::{ diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 24cfd8f..fa47ec2 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -109,12 +109,23 @@ use std::io::{BufRead, BufReader}; use std::marker::PhantomData; use std::path::{Path, PathBuf}; +use crate::data::DatumType; use crate::error::GRangesError; use crate::io::file::InputStream; use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; -use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, TsvSerialize}; +use crate::traits::{ + GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, Selection, TsvSerialize, 
+}; use crate::Position; +use super::tsv::TsvConfig; + +// FEATURE/TODO: hints? if not performance cost +// use lazy_static::lazy_static; +//lazy_static! { +// static ref POSITION_HINT: Option = Some("Check that the start position column is a valid integer.".to_string()); +//} + /// Get the *base* extension to help infer filetype, which ignores compression-related /// extensions (`.gz` and `.bgz`). fn get_base_extension>(filepath: P) -> Option { @@ -405,7 +416,8 @@ pub enum Strand { } impl TsvSerialize for Option { - fn to_tsv(&self) -> String { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { match self { Some(Strand::Forward) => "+".to_string(), Some(Strand::Reverse) => "-".to_string(), @@ -421,27 +433,39 @@ pub struct Bed5Addition { pub score: f64, } -/// The additional three BED6 columns. -// TODO: not connectted yet -#[derive(Clone, Debug)] -pub struct Bed6Addition { - pub name: String, - pub score: f64, - pub strand: Option, +impl Selection for &Bed5Addition { + fn select_by_name(&self, name: &str) -> DatumType { + match name { + "name" => DatumType::String(self.name.clone()), + "score" => DatumType::Float64(self.score), + _ => panic!("No item named '{}'", name), + } + } } impl TsvSerialize for &Bed5Addition { - fn to_tsv(&self) -> String { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { format!("{}\t{}", self.name, self.score) } } impl TsvSerialize for Bed5Addition { - fn to_tsv(&self) -> String { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { format!("{}\t{}", self.name, self.score) } } +/// The additional three BED6 columns. +// TODO: not connectted yet +#[derive(Clone, Debug)] +pub struct Bed6Addition { + pub name: String, + pub score: f64, + pub strand: Option, +} + /// A lazy parser for BED5 files. 
This parses the first three range columns, the feature name, /// and the score, yielding a [`GenomicRangeRecord`] will be set to `None`, since there are no remaining /// string columns to parse. @@ -793,10 +817,13 @@ pub fn parse_column(column: &str, line: &str) -> Result::Err: std::fmt::Debug, { - // NOTE: this is used a lot, and should be benchmarked. column .parse::() - .map_err(|_| GRangesError::InvalidColumnType(format!("column '{}' in '{}'", column, line))) + .map_err(|_| GRangesError::InvalidColumnType { + expected_type: std::any::type_name::().to_string(), // Provides the expected type name + found_value: column.to_string(), + line: line.to_string(), + }) } /// Parses a BED-like TSV format line into its constituent components. @@ -816,7 +843,11 @@ where pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { let columns: Vec<&str> = line.split('\t').collect(); if columns.len() < 3 { - return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + return Err(GRangesError::BedTooFewColumns( + columns.len(), + 3, + line.to_string(), + )); } let seqname = parse_column(columns[0], line)?; @@ -837,7 +868,10 @@ pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { let columns: Vec<&str> = line.splitn(4, '\t').collect(); if columns.len() < 3 { - return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + return Err(GRangesError::Bed3TooFewColumns( + columns.len(), + line.to_string(), + )); } let seqname = parse_column(columns[0], line)?; @@ -863,7 +897,10 @@ pub fn parse_bed_lazy(line: &str) -> Result>, pub fn parse_bed3(line: &str) -> Result { let columns: Vec<&str> = line.splitn(4, '\t').collect(); if columns.len() < 3 { - return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + return Err(GRangesError::Bed3TooFewColumns( + columns.len(), + line.to_string(), + )); } let seqname = parse_column(columns[0], 
line)?; @@ -877,6 +914,7 @@ pub fn parse_bed3(line: &str) -> Result { }) } +#[allow(dead_code)] fn parse_strand(symbol: char) -> Result, GRangesError> { match symbol { '+' => Ok(Some(Strand::Forward)), @@ -890,9 +928,13 @@ fn parse_strand(symbol: char) -> Result, GRangesError> { /// columns /// pub fn parse_bed5(line: &str) -> Result, GRangesError> { - let columns: Vec<&str> = line.splitn(4, '\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + let columns: Vec<&str> = line.splitn(6, '\t').collect(); + if columns.len() < 5 { + return Err(GRangesError::BedTooFewColumns( + columns.len(), + 5, + line.to_string(), + )); } let seqname = parse_column(columns[0], line)?; @@ -900,7 +942,7 @@ pub fn parse_bed5(line: &str) -> Result, GRange let end: Position = parse_column(columns[2], line)?; let name = parse_column(columns[3], line)?; - let score: f64 = parse_column(columns[3], line)?; + let score: f64 = parse_column(columns[4], line)?; let data = Bed5Addition { name, score }; diff --git a/src/io/tsv.rs b/src/io/tsv.rs new file mode 100644 index 0000000..dc0429f --- /dev/null +++ b/src/io/tsv.rs @@ -0,0 +1,72 @@ +//! TSV Serializing helpers, functionality, etc. + +use crate::{data::DatumType, traits::TsvSerialize}; +use lazy_static::lazy_static; + +lazy_static! { + /// The standard BED format TSV configuration. + pub static ref BED_TSV: TsvConfig = TsvConfig { + no_value_string: ".".to_string(), + }; +} + +/// This is an extensible type to handle common +/// TSV output configurations, e.g. what to print +/// for `None` or [`DatumType::NoValue`]. 
+pub struct TsvConfig { + pub no_value_string: String, +} + +impl TsvSerialize for &String { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + self.to_string() + } +} + +impl TsvSerialize for String { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + self.to_string() + } +} + +impl TsvSerialize for Option { + fn to_tsv(&self, config: &TsvConfig) -> String { + self.as_ref() + .map_or("".to_string(), |x| format!("\t{}", x.to_tsv(config))) + } +} + +impl TsvSerialize for Vec { + fn to_tsv(&self, config: &TsvConfig) -> String { + self.iter() + .map(|x| x.to_tsv(config)) + .collect::>() + .join("\t") + } +} + +impl TsvSerialize for &Vec { + fn to_tsv(&self, config: &TsvConfig) -> String { + self.iter() + .map(|x| x.to_tsv(config)) + .collect::>() + .join("\t") + } +} + +impl TsvSerialize for DatumType { + fn to_tsv(&self, config: &TsvConfig) -> String { + match self { + DatumType::Float32(val) => val.to_string(), + DatumType::Float64(val) => val.to_string(), + DatumType::String(val) => val.clone(), + DatumType::Integer32(val) => val.to_string(), + DatumType::Integer64(val) => val.to_string(), + DatumType::Unsigned32(val) => val.to_string(), + DatumType::Unsigned64(val) => val.to_string(), + DatumType::NoValue => config.no_value_string.clone(), + } + } +} diff --git a/src/join.rs b/src/join.rs index 348814d..d68cd76 100644 --- a/src/join.rs +++ b/src/join.rs @@ -110,33 +110,33 @@ impl<'a, DL, DR> JoinData<'a, DL, DR> { self.len() == 0 } - ///// Create an iterator over the joins. - //pub fn iter(&'a self) -> JoinDataIterator<'a, DL, DR> { - // JoinDataIterator { - // inner: self.joins.iter(), - // left_data: self.left_data.as_ref(), - // right_data: self.right_data, - // } - //} + /// Create an iterator over the joins. 
+ pub fn iter(&'a self) -> JoinDataIterator<'a, DL, DR> { + JoinDataIterator { + inner: self.joins.iter(), + left_data: Some(&self.left_data), + right_data: Some(self.right_data), + } + } } pub struct CombinedJoinData { - pub join: LeftGroupedJoin, // Information on the join - pub left_data: DL, // The left data element - pub right_data: Vec, // The right data elements + pub join: LeftGroupedJoin, // Information on the join + pub left_data: DL, // The left data element + pub right_data: Vec, // The right data elements } impl JoinDataOperations for CombinedJoinData { type LeftDataElementType = DL; type RightDataElementType = DR; - fn join(&self) -> LeftGroupedJoin { - self.join + fn join(&self) -> &LeftGroupedJoin { + &self.join } - fn left_data(&self) -> Option { - Some(self.left_data) + fn left_data(&self) -> Option<&Self::LeftDataElementType> { + Some(&self.left_data) } - fn right_data(&self) -> Option> { - Some(self.right_data) + fn right_data(&self) -> Option<&Vec> { + Some(&self.right_data) } } @@ -156,20 +156,18 @@ where >, ) -> V, { - // Cloning `left_data` and `right_data` to ensure they live long enough. - // This might not be the most efficient but ensures lifetime correctness. - self.joins .iter() .map(|join| { let left_data = self.left_data.get_owned(join.left.unwrap()); let right_data = join - .rights.as_ref() + .rights + .as_ref() .unwrap() .iter() .map(|idx| self.right_data.get_owned(*idx)) .collect(); - // Now `func` is applied to each `CombinedJoinData` + func(CombinedJoinData { join: join.clone(), left_data, @@ -209,6 +207,15 @@ impl<'a, DR> JoinDataLeftEmpty<'a, DR> { pub fn is_empty(&self) -> bool { self.len() == 0 } + + /// Create an iterator over the joins. 
+ pub fn iter(&'a self) -> JoinDataIterator<'a, (), DR> { + JoinDataIterator { + inner: self.joins.iter(), + left_data: None, + right_data: Some(self.right_data), + } + } } pub struct CombinedJoinDataLeftEmpty { @@ -219,18 +226,17 @@ pub struct CombinedJoinDataLeftEmpty { impl JoinDataOperations<(), DR> for CombinedJoinDataLeftEmpty { type LeftDataElementType = (); type RightDataElementType = DR; - fn join(&self) -> LeftGroupedJoin { - self.join + fn join(&self) -> &LeftGroupedJoin { + &self.join } - fn left_data(&self) -> Option { + fn left_data(&self) -> Option<&Self::LeftDataElementType> { None } - fn right_data(&self) -> Option> { - Some(self.right_data) + fn right_data(&self) -> Option<&Vec> { + Some(&self.right_data) } } - impl<'a, DR> JoinDataLeftEmpty<'a, DR> where DR: IndexedDataContainer + 'a, @@ -241,19 +247,19 @@ where where F: Fn(CombinedJoinDataLeftEmpty<::OwnedItem>) -> V, { - // Cloning `left_data` and `right_data` to ensure they live long enough. - // This might not be the most efficient but ensures lifetime correctness. - + // TODO/OPTIMIZE: would consuming here (and analagous funcs) be better/faster? + // Would require a bit of a redesign. self.joins .iter() .map(|join| { - let right_data = join - .rights.as_ref() - .unwrap() - .iter() - .map(|idx| self.right_data.get_owned(*idx)) - .collect(); - // Now `func` is applied to each `CombinedJoinDataLeftEmpty` + let right_indices = join.rights.as_ref(); + let right_data = right_indices.map_or(Vec::new(), |indices| { + indices + .iter() + .map(|idx| self.right_data.get_owned(*idx)) + .collect() + }); + func(CombinedJoinDataLeftEmpty { join: join.clone(), right_data, @@ -292,6 +298,15 @@ impl<'a, DL> JoinDataRightEmpty
{ pub fn is_empty(&self) -> bool { self.len() == 0 } + + /// Create an iterator over the joins. + pub fn iter(&'a self) -> JoinDataIterator<'a, DL, ()> { + JoinDataIterator { + inner: self.joins.iter(), + left_data: Some(&self.left_data), + right_data: None, + } + } } pub struct CombinedJoinDataRightEmpty
<DL> { @@ -299,23 +314,20 @@ pub struct CombinedJoinDataRightEmpty<DL>
{ pub left_data: DL, // The right data element } - impl<DL>
JoinDataOperations<DL, ()> for CombinedJoinDataRightEmpty<DL>
{ type LeftDataElementType = DL; type RightDataElementType = (); - fn join(&self) -> LeftGroupedJoin { - self.join + fn join(&self) -> &LeftGroupedJoin { + &self.join } - fn left_data(&self) -> Option { - Some(self.left_data) + fn left_data(&self) -> Option<&Self::LeftDataElementType> { + Some(&self.left_data) } - fn right_data(&self) -> Option> { + fn right_data(&self) -> Option<&Vec> { None } } - - impl<'a, DL> JoinDataRightEmpty
<DL> where DL: IndexedDataContainer, @@ -326,14 +338,11 @@ where F: Fn(CombinedJoinDataRightEmpty<<DL as IndexedDataContainer>
::OwnedItem>) -> V, { - // Cloning `left_data` and `right_data` to ensure they live long enough. - // This might not be the most efficient but ensures lifetime correctness. - self.joins .iter() .map(|join| { let left_data = self.left_data.get_owned(join.left.unwrap()); - // Now `func` is applied to each `CombinedJoinDataRightEmpty` + func(CombinedJoinDataRightEmpty { join: join.clone(), left_data, @@ -371,28 +380,35 @@ impl JoinDataBothEmpty { pub fn is_empty(&self) -> bool { self.len() == 0 } + + /// Create an iterator over the joins. + pub fn iter(&self) -> JoinDataIterator<'_, (), ()> { + JoinDataIterator { + inner: self.joins.iter(), + left_data: None, + right_data: None, + } + } } pub struct CombinedJoinDataBothEmpty { pub join: LeftGroupedJoin, } - impl JoinDataOperations<(), ()> for CombinedJoinDataBothEmpty { type LeftDataElementType = (); type RightDataElementType = (); - fn join(&self) -> LeftGroupedJoin { - self.join + fn join(&self) -> &LeftGroupedJoin { + &self.join } - fn left_data(&self) -> Option { + fn left_data(&self) -> Option<&Self::LeftDataElementType> { None } - fn right_data(&self) -> Option> { + fn right_data(&self) -> Option<&Vec> { None } } - impl JoinDataBothEmpty { /// Apply `func` to each element, putting the results into the returned /// `Vec`. @@ -400,9 +416,6 @@ impl JoinDataBothEmpty { where F: Fn(CombinedJoinDataBothEmpty) -> V, { - // Cloning `left_data` and `right_data` to ensure they live long enough. - // This might not be the most efficient but ensures lifetime correctness. - self.joins .iter() .map(|join| func(CombinedJoinDataBothEmpty { join: join.clone() })) @@ -410,24 +423,24 @@ impl JoinDataBothEmpty { } } -///// An iterator over the [`LeftGroupedJoin`] types that represent -///// information about overlaps right ranges have with a particular left range. -///// -///// This also contains references to the left and right data containers, for -///// better ergonomics in downstream data processing. 
-//pub struct JoinDataIterator<'a, DL, DR> { -// inner: std::slice::Iter<'a, LeftGroupedJoin>, -// pub left_data: Option<&'a DL>, -// pub right_data: Option<&'a DR>, -//} -// -//impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> { -// type Item = &'a LeftGroupedJoin; -// -// fn next(&mut self) -> Option { -// self.inner.next() -// } -//} +/// An iterator over the [`LeftGroupedJoin`] types that represent +/// information about overlaps right ranges have with a particular left range. +/// +/// This also contains references to the left and right data containers, for +/// better ergonomics in downstream data processing. +pub struct JoinDataIterator<'a, DL, DR> { + inner: std::slice::Iter<'a, LeftGroupedJoin>, + pub left_data: Option<&'a DL>, + pub right_data: Option<&'a DR>, +} + +impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> { + type Item = &'a LeftGroupedJoin; + + fn next(&mut self) -> Option { + self.inner.next() + } +} #[cfg(test)] mod tests { diff --git a/src/lib.rs b/src/lib.rs index a73463b..c2b6324 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -159,8 +159,8 @@ pub mod prelude { pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, - IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize, - LeftOverlaps, + IntoDatumType, IntoIterableRangesContainer, IterableRangeContainer, JoinDataOperations, + LeftOverlaps, TsvSerialize, }; pub use crate::seqlens; diff --git a/src/main/mod.rs b/src/main/mod.rs index 2048bf1..282e379 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -2,9 +2,10 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ - commands::{granges_adjust, granges_filter, granges_flank, ProcessingMode}, + commands::{granges_adjust, granges_filter, granges_flank, granges_map, ProcessingMode}, + data::operations::Operation, prelude::GRangesError, - Position, PositionOffset, 
data::operations::{match_operation, Operation}, + Position, PositionOffset, }; #[cfg(feature = "dev-commands")] @@ -78,7 +79,7 @@ enum Commands { /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. /// By default, ranges with sequence names not in the genome file will raise an error. - #[arg(short = 'f', long)] + #[arg(short, long)] skip_missing: bool, }, Flank { @@ -108,7 +109,7 @@ enum Commands { /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. /// By default, ranges with sequence names not in the genome file will raise an error. - #[arg(short = 'f', long)] + #[arg(long)] skip_missing: bool, /// Processing mode @@ -128,9 +129,9 @@ enum Commands { #[arg(short, long, required = true)] right: PathBuf, - /// Operation - #[clap(short, long, value_parser = clap::value_parser!(Operation), multiple_occurrences(true))] - operation: Vec, + /// Operation + #[clap(short, long, value_parser = clap::value_parser!(Operation), use_value_delimiter = true, value_delimiter = ',')] + func: Vec, /// An optional output file (standard output will be used if not specified) #[arg(short, long)] @@ -138,12 +139,8 @@ enum Commands { /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. /// By default, ranges with sequence names not in the genome file will raise an error. 
- #[arg(short = 'f', long)] + #[arg(short, long)] skip_missing: bool, - - /// Processing mode - #[arg(long)] - in_mem: bool, }, #[cfg(feature = "dev-commands")] RandomBed { @@ -227,11 +224,21 @@ fn run() -> Result<(), GRangesError> { genome, left, right, - operation, + func, output, skip_missing, - in_mem, }) => { + if func.is_empty() { + return Err(GRangesError::NoOperationSpecified); + } + granges_map( + genome, + left, + right, + func.to_vec(), + output.as_ref(), + *skip_missing, + ) } #[cfg(feature = "dev-commands")] diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 39edda5..0ff77e4 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -6,6 +6,7 @@ use std::ops::Range; use crate::{ error::GRangesError, + io::tsv::TsvConfig, traits::{ AdjustableGenericRange, GenericRange, GenericRangeOperations, IndexedDataContainer, TsvSerialize, @@ -241,7 +242,8 @@ impl GenericRangeOperations for GenomicRangeRecord { } impl TsvSerialize for GenomicRangeRecord<()> { - fn to_tsv(&self) -> String { + #[allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) } } @@ -274,13 +276,14 @@ impl TsvSerialize for GenomicRangeRecord { /// intentional, since one common use case is when `U` is an [`Option`], /// which is the type combination commonly used in lazy parsing. In this case, /// when the data is `None`, that indicates no more columns. 
- fn to_tsv(&self) -> String { + #[allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { format!( - "{}\t{}\t{}{}", + "{}\t{}\t{}\t{}", self.seqname, self.start, self.end, - self.data.to_tsv() + self.data.to_tsv(config) ) } } @@ -357,7 +360,8 @@ impl GenericRangeOperations for GenomicRangeEmptyRecord { } impl TsvSerialize for GenomicRangeEmptyRecord { - fn to_tsv(&self) -> String { + #[allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { format!("{}\t{}\t{}", self.seqname, self.start, self.end) } } diff --git a/src/sequences/nucleotide.rs b/src/sequences/nucleotide.rs index 54d79dd..ac1c223 100644 --- a/src/sequences/nucleotide.rs +++ b/src/sequences/nucleotide.rs @@ -212,10 +212,12 @@ impl LazyNucleotideSequences { .index() .iter() .filter_map(|r| { - let name = String::from_utf8(r.name().to_vec()).expect(&format!( - "{}\nError in converting FASTA name to a UTF8 String.", - &INTERNAL_ERROR_MESSAGE - )); + let name = String::from_utf8(r.name().to_vec()).unwrap_or_else(|_| { + panic!( + "{}\nError in converting FASTA name to a UTF8 String.", + &INTERNAL_ERROR_MESSAGE + ) + }); if allowed_seqnames .as_ref() .map_or(true, |seqnames| seqnames.contains(&name)) diff --git a/src/traits.rs b/src/traits.rs index a3bc4fa..2d71aba 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -6,11 +6,16 @@ use std::path::PathBuf; use indexmap::IndexMap; use crate::{ + data::DatumType, error::GRangesError, granges::GRanges, - io::parsers::{FilteredRanges, UnwrappedRanges}, + io::{ + parsers::{FilteredRanges, UnwrappedRanges}, + tsv::TsvConfig, + }, + join::LeftGroupedJoin, ranges::GenomicRangeRecord, - Position, join::LeftGroupedJoin, + Position, }; /// Traits for [`GRanges`] types that can be modified. 
@@ -27,7 +32,7 @@ pub trait AsGRangesRef<'a, C, T> { } /// The [`LeftOverlaps`] trait provides compile time polymorphic behavior -/// over its associated [`LeftOverlaps::Output`] type and its `Right` +/// over its associated [`LeftOverlaps::Output`] type and its `Right` /// generic type. pub trait LeftOverlaps<'a, Right> { type Output; @@ -39,7 +44,11 @@ pub trait LeftOverlaps<'a, Right> { /// object, for some mix of generic types, to a TSV file. pub trait GenomicRangesTsvSerialize<'a, C: RangeContainer> { /// Output the TSV version of this [`GRanges`] object. - fn to_tsv(&'a self, output: Option>) -> Result<(), GRangesError>; + fn to_tsv( + &'a self, + output: Option>, + config: &TsvConfig, + ) -> Result<(), GRangesError>; } /// The [`GenericRange`] trait defines common functionality for all range types. @@ -78,10 +87,10 @@ pub trait GenericRange: Clone { } } -/// The [`JoinDataOperations`] trait unifies common operations -/// over combined join data types ([`CombinedJoinData`], +/// The [`JoinDataOperations`] trait unifies common operations +/// over combined join data types ([`CombinedJoinData`], /// CombinedJoinDataBothEmpty`], etc). 
-/// +/// /// /// [`CombinedJoinData`] crate::granges::join::CombinedJoinData /// [`CombinedJoinDataBothEmpty`] crate::granges::join::CombinedJoinDataBothEmpty @@ -89,9 +98,22 @@ pub trait JoinDataOperations { type LeftDataElementType; type RightDataElementType; - fn join(&self) -> LeftGroupedJoin; - fn left_data(&self) -> Option; - fn right_data(&self) -> Option>; + fn join(&self) -> &LeftGroupedJoin; + fn left_data(&self) -> Option<&Self::LeftDataElementType>; + fn right_data(&self) -> Option<&Vec>; +} + +/// Helper trait to convert selected fields into `DataType` +pub trait IntoDatumType { + fn into_data_type(self) -> DatumType; +} + +// TODO +pub trait Selection { + fn select_by_name(&self, name: &str) -> DatumType; + fn select(&self, names: &[String]) -> Vec { + names.iter().map(|name| self.select_by_name(name)).collect() + } } /// The [`GenericRangeOperations`] trait extends additional functionality to [`GenericRange`], @@ -274,33 +296,5 @@ pub trait Sequences<'a> { /// Defines how to serialize something to TSV. pub trait TsvSerialize { // Serialize something to a TSV [`String`]. - fn to_tsv(&self) -> String; -} - -impl TsvSerialize for &String { - fn to_tsv(&self) -> String { - self.to_string() - } -} - -impl TsvSerialize for String { - fn to_tsv(&self) -> String { - self.to_string() - } -} - -impl TsvSerialize for Option { - fn to_tsv(&self) -> String { - self.as_ref() - .map_or("".to_string(), |x| format!("\t{}", x.to_tsv())) - } -} - -impl TsvSerialize for Vec { - fn to_tsv(&self) -> String { - self.iter() - .map(|x| x.to_tsv()) - .collect::>() - .join("\t") - } + fn to_tsv(&self, config: &TsvConfig) -> String; }