From 828d9e28adea276e3953de1e9b2b209f7c6d6b6e Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Thu, 15 Feb 2024 17:39:31 -0800 Subject: [PATCH] Adjust subcommand, output writing for basic BED, reporting API - granges_adjust and adjust subcommand - random granges - dev-commands feature & RandomBed - to_bed3() - empty range iterator - cleaner output to standard out - reporting API - TSV trait and impls --- Cargo.toml | 6 ++- src/error.rs | 2 +- src/granges.rs | 50 +++++++++++++++++++--- src/io/{io.rs => file.rs} | 45 ++++++++++++++------ src/io/mod.rs | 6 +-- src/io/parsers.rs | 22 +++++----- src/iterators.rs | 57 +++++++++++++++++++++++++- src/lib.rs | 7 +++- src/main/commands.rs | 65 +++++++++++++++++++++++++---- src/main/mod.rs | 35 +++++++++++++--- src/main/reporting.rs | 28 +++++++++++++ src/ranges/coitrees.rs | 26 ++++++------ src/ranges/mod.rs | 61 +++++++++++++++++++++++++-- src/ranges/operations.rs | 82 +++++++++++++++++++++++++++++++++---- src/ranges/vec.rs | 16 ++++++++ src/test_utilities.rs | 32 ++++++++++++++- src/traits.rs | 24 ++++++++++- tests_data/hg38_seqlens.tsv | 22 ++++++++++ 18 files changed, 508 insertions(+), 78 deletions(-) rename src/io/{io.rs => file.rs} (85%) create mode 100644 src/main/reporting.rs create mode 100644 tests_data/hg38_seqlens.tsv diff --git a/Cargo.toml b/Cargo.toml index 0e635ac..f5c2ce5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,8 +22,10 @@ noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" thiserror = "1.0.57" -# [features] -# cli = [ "clap" ] +[features] +# cli = [ "clap" ] // TODO make feature +dev-commands = [ ] + [[bin]] name = "granges" diff --git a/src/error.rs b/src/error.rs index 7feabc3..6876a7c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,4 +1,4 @@ -use std::num::{ParseIntError, ParseFloatError}; +use std::num::{ParseFloatError, ParseIntError}; use genomap::GenomeMapError; use thiserror::Error; diff --git a/src/granges.rs b/src/granges.rs index e6b679a..7088610 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,15 +1,19 @@ +use std::path::PathBuf; + use genomap::GenomeMap; use indexmap::IndexMap; use crate::{ + io::OutputFile, + iterators::{GRangesEmptyIterator, GRangesIterator}, prelude::GRangesError, ranges::{ coitrees::{COITrees, COITreesIndexed}, vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, RangeEmpty, RangeIndexed, RangeRecord, }, - traits::{RangeContainer, RangesIterable}, - Position, iterators::GRangesIterator, + traits::{RangeContainer, RangesIterable, TsvSerialize}, + Position, }; #[derive(Clone, Debug)] @@ -145,6 +149,27 @@ impl GRanges { } } +impl GRanges +where + R: RangeContainer + RangesIterable, +{ + // TODO: candidate for a trait + pub fn to_bed3(&self, output: Option>) -> Result<(), GRangesError> { + // output stream -- header is None for now (TODO) + let output = output.map_or(OutputFile::new_stdout(None), |file| { + OutputFile::new(file, None) + }); + let mut writer = output.writer()?; + + let seqnames = self.seqnames(); + for range in self.iter_ranges_empty() { + let record = range.to_record(&seqnames); + writeln!(writer, "{}", record.to_tsv())?; + } + Ok(()) + } +} + impl GRanges { /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library @@ -163,10 +188,25 @@ impl GRanges { } } -impl GRanges -where R: RangesIterable { +impl GRanges +where + R: RangesIterable, +{ + /// Create a new [`GRangesIterator`] to iterate through all + /// the ranges in this [`GRanges`] object. These ranges carry + /// no data index, unlike the method [`GRanges.iter_ranges()`] + /// available for range type for associated data containers. + pub fn iter_ranges_empty(&self) -> GRangesEmptyIterator<'_, R> { + GRangesEmptyIterator::new(&self.ranges) + } +} + +impl GRanges +where + R: RangesIterable, +{ /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object. - pub fn iter_ranges<'a>(&'a self) -> GRangesIterator<'a, R> { + pub fn iter_ranges(&self) -> GRangesIterator<'_, R> { GRangesIterator::new(&self.ranges) } } diff --git a/src/io/io.rs b/src/io/file.rs similarity index 85% rename from src/io/io.rs rename to src/io/file.rs index f0d62a2..8fc69b7 100644 --- a/src/io/io.rs +++ b/src/io/file.rs @@ -13,11 +13,13 @@ use std::io::{self, BufWriter}; use std::io::{BufRead, BufReader, Read}; use std::path::PathBuf; -use crate::Position; use crate::error::GRangesError; +use crate::Position; /// Read a tab-delimited *genome file* of sequence (i.e. chromosome) names and their lengths. -pub fn read_seqlens(filepath: impl Into) -> Result, GRangesError> { +pub fn read_seqlens( + filepath: impl Into, +) -> Result, GRangesError> { let input_file = InputFile::new(filepath); let reader = input_file.reader()?; @@ -136,12 +138,17 @@ impl InputFile { } } +enum OutputDestination { + File(PathBuf), + Stdout, +} + /// Represents an output file. /// /// This struct is used to handle operations on an output file, such as writing to the file. /// This abstracts writing both plaintext and gzip-compressed files. pub struct OutputFile { - pub filepath: PathBuf, + destination: OutputDestination, pub header: Option>, } @@ -155,7 +162,15 @@ impl OutputFile { /// * `header` - An optional vector of strings representing commented header lines to be written to the file. pub fn new(filepath: impl Into, header: Option>) -> Self { Self { - filepath: filepath.into(), + destination: OutputDestination::File(filepath.into()), + header, + } + } + + /// Constructs a new [`OutputFile`] for standard output. + pub fn new_stdout(header: Option>) -> Self { + Self { + destination: OutputDestination::Stdout, header, } } @@ -170,15 +185,19 @@ impl OutputFile { /// /// A result containing a `Box` on success, or an `io::Error` on failure. pub fn writer(&self) -> io::Result> { - let outfile = &self.filepath; - let is_gzip = outfile.ends_with(".gz"); - let mut writer: Box = if is_gzip { - Box::new(BufWriter::new(GzEncoder::new( - File::create(outfile)?, - Compression::default(), - ))) - } else { - Box::new(BufWriter::new(File::create(outfile)?)) + let mut writer: Box = match &self.destination { + OutputDestination::File(path) => { + let is_gzip = path.ends_with(".gz"); + if is_gzip { + Box::new(BufWriter::new(GzEncoder::new( + File::create(path)?, + Compression::default(), + ))) + } else { + Box::new(BufWriter::new(File::create(path)?)) + } + } + OutputDestination::Stdout => Box::new(io::stdout()), }; // write header if one is set if let Some(entries) = &self.header { diff --git a/src/io/mod.rs b/src/io/mod.rs index 10df12c..44003a0 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,9 +1,9 @@ //! Input/Output //! -pub mod io; +pub mod file; pub mod noodles; pub mod parsers; -pub use io::{InputFile, OutputFile}; -pub use parsers::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; +pub use file::{InputFile, OutputFile}; +pub use parsers::{Bed3RecordIterator, BedlikeIterator, TsvRecordIterator}; diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 574f0af..7b3faf8 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -6,11 +6,11 @@ use std::collections::HashSet; use std::io::{BufRead, BufReader, Read}; use std::path::PathBuf; -use crate::Position; use crate::error::GRangesError; -use crate::io::io::InputFile; +use crate::io::file::InputFile; use crate::ranges::RangeRecord; use crate::traits::GeneralRangeRecordIterator; +use crate::Position; use super::noodles::convert_noodles_position; @@ -101,6 +101,7 @@ where /// A BED-like file parser. This works by lazy-parsing the first three /// columns, which are standard to all BED files. +#[allow(clippy::type_complexity)] pub struct BedlikeIterator { iter: TsvRecordIterator Result, GRangesError>, String>, } @@ -109,7 +110,7 @@ impl BedlikeIterator { pub fn new(filepath: impl Into) -> Result { // Wrap the parse_bedlike_to_range_record function to conform with TsvRecordIterator's expectations. let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; - + let iter = TsvRecordIterator::new(filepath, parser)?; Ok(Self { iter }) } @@ -123,8 +124,6 @@ impl Iterator for BedlikeIterator { } } - - /// An iterator over [`IntervalRecord`] items that filters based on sequence name. /// /// Note that that the exclude filter is prioritized over the retain filter. So, @@ -282,7 +281,11 @@ pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { let start: Position = parse_column(columns[1], line)?; let end: Position = parse_column(columns[2], line)?; - let data = columns[3].to_string(); + let data = if columns.len() > 3 { + columns[3].to_string() + } else { + String::new() + }; Ok(RangeRecord { seqname, @@ -312,7 +315,7 @@ mod tests { // let first_interval = gr_iter.next().unwrap(); // assert_eq!(first_interval.first, 7); // assert_eq!(first_interval.last, 12); - // + // // let second_interval = gr_iter.next().unwrap(); // assert_eq!(second_interval.first, 20); // assert_eq!(second_interval.last, 33); @@ -336,9 +339,6 @@ mod tests { // note: the Rust LSP thinks this isn't used for some reason, so prefaced with _ // to silence warnings. let _msg = "column '-1' in 'chr1\t-1\t20'".to_string(); - assert!(matches!( - result, - Err(GRangesError::InvalidColumnType(_msg)) - )); + assert!(matches!(result, Err(GRangesError::InvalidColumnType(_msg)))); } } diff --git a/src/iterators.rs b/src/iterators.rs index d032d78..5b8a13f 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,11 +1,11 @@ use genomap::GenomeMap; use crate::{ - ranges::{RangeIndexed, RangeIndexedRecord}, + ranges::{RangeEmpty, RangeEmptyRecord, RangeIndexed, RangeIndexedRecord}, traits::{RangeContainer, RangesIterable}, }; -/// An iterator over [`RangeIndexedRecord`], which store +/// An iterator yielding [`RangeIndexedRecord`], which store /// indices to the sequence names and data container. /// /// # Developer Notes @@ -63,6 +63,59 @@ where } } +/// An iterator over [`RangeEmptyRecord`], which store +/// indices to the sequence names (but carries no data index). +pub struct GRangesEmptyIterator<'a, R> { + ranges: &'a GenomeMap, + current_seqname_index: usize, + current_range_iter: Box + 'a>, +} + +impl<'a, R> GRangesEmptyIterator<'a, R> +where + R: RangesIterable, +{ + pub fn new(ranges: &'a GenomeMap) -> Self { + let current_range_iter = ranges.get_by_index(0).unwrap().iter_ranges(); + Self { + ranges, + current_seqname_index: 0, + current_range_iter, + } + } +} + +impl<'a, R> Iterator for GRangesEmptyIterator<'a, R> +where + R: RangeContainer + RangesIterable, +{ + type Item = RangeEmptyRecord; + + fn next(&mut self) -> Option { + loop { + if let Some(next_range) = self.current_range_iter.next() { + return Some(RangeEmptyRecord { + seqname_index: self.current_seqname_index, + start: next_range.start, + end: next_range.end, + }); + } else { + // try to load another sequence's set of ranges. + self.current_seqname_index += 1; + if self.current_seqname_index >= self.ranges.len() { + // we're out of range container iterators + return None; + } + self.current_range_iter = self + .ranges + .get_by_index(self.current_seqname_index) + .unwrap() + .iter_ranges(); + } + } + } +} + #[cfg(test)] mod tests { use crate::{ranges::RangeIndexedRecord, test_utilities::granges_test_case_01}; diff --git a/src/lib.rs b/src/lib.rs index 5620e6e..19b1484 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,10 +19,13 @@ pub type PositionOffset = i32; // signed variant pub mod prelude { pub use crate::error::GRangesError; pub use crate::granges::GRanges; - pub use crate::io::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; + pub use crate::io::file::read_seqlens; + pub use crate::io::{Bed3RecordIterator, BedlikeIterator, TsvRecordIterator}; pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; - pub use crate::traits::{GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable}; + pub use crate::traits::{ + GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable, TsvSerialize, + }; pub use crate::seqlens; } diff --git a/src/main/commands.rs b/src/main/commands.rs index a71e774..56dd90a 100644 --- a/src/main/commands.rs +++ b/src/main/commands.rs @@ -1,22 +1,73 @@ use std::path::PathBuf; -use granges::io::io::read_seqlens; -use granges::prelude::*; - +use granges::io::OutputFile; +use granges::ranges::operations::adjust_range_record; +use granges::test_utilities::random_granges; +use granges::{prelude::*, PositionOffset}; +use crate::reporting::{CommandOutput, Report}; /// Adjust the genomic ranges in a bedfile by some specified amount. // NOTE: we don't do build the full GRanges objects here, for efficiency. // But it would be a good benchmark to see how much slower that is. -pub fn granges_adjust(bedfile: &PathBuf, seqlens: &PathBuf, both: &i32, output: Option<&PathBuf>) -> Result<(), GRangesError> { +pub fn granges_adjust( + bedfile: &PathBuf, + seqlens: &PathBuf, + both: PositionOffset, + output: Option<&PathBuf>, +) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; + // input iterator let bedlike_iterator = BedlikeIterator::new(bedfile)?; + // output stream -- header is None for now (TODO) + let output = output.map_or(OutputFile::new_stdout(None), |file| { + OutputFile::new(file, None) + }); + let mut writer = output.writer()?; + + // for reporting stuff to the user + let mut report = Report::new(); + + let mut skipped_ranges = 0; for record in bedlike_iterator { let range = record?; - let seqname = range.seqname; - let length = *genome.get(&seqname).ok_or(GRangesError::MissingSequence(seqname))?; + let seqname = &range.seqname; + let length = *genome + .get(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + + let possibly_adjusted_range = adjust_range_record(range, -both, both, length); + if let Some(range_adjusted) = possibly_adjusted_range { + writeln!(writer, "{}", range_adjusted.to_tsv())?; + } else { + skipped_ranges += 1; + } + + if skipped_ranges > 0 { + report.add_issue(format!( + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) + } } - Ok(()) + Ok(CommandOutput::new((), report)) +} + +/// Generate a random BED-like file with genomic ranges. +pub fn granges_random_bed( + seqlens: &PathBuf, + num: u32, + output: Option<&PathBuf>, +) -> Result, GRangesError> { + // get the genome info + let genome = read_seqlens(seqlens)?; + + let gr = random_granges(&genome, num.try_into().unwrap())?; + + gr.to_bed3(output)?; + + let report = Report::new(); + Ok(CommandOutput::new((), report)) } diff --git a/src/main/mod.rs b/src/main/mod.rs index c944c02..ec27124 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -1,9 +1,11 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; -use granges::prelude::GRangesError; +use commands::granges_random_bed; +use granges::{prelude::GRangesError, PositionOffset}; pub mod commands; +pub mod reporting; use crate::commands::granges_adjust; const INFO: &str = "\ @@ -38,26 +40,49 @@ enum Commands { bedfile: PathBuf, /// number of basepairs to expand the range start and end positions by #[arg(long)] - both: i32, + both: PositionOffset, /// an optional output file (standard output will be used if not specified) + #[arg(long)] + output: Option, + }, + + #[cfg(feature = "dev-commands")] + RandomBed { + /// a TSV genome file of chromosome names and their lengths + #[arg(required = true)] + seqlens: PathBuf, + /// number of random ranges to generate + #[arg(long, required = true)] + num: u32, + /// an optional output file (standard output will be used if not specified) + #[arg(long)] output: Option, }, } fn run() -> Result<(), GRangesError> { let cli = Cli::parse(); - match &cli.command { + let result = match &cli.command { Some(Commands::Adjust { bedfile, seqlens, both, output, - }) => granges_adjust(bedfile, seqlens, both, output.as_ref()), + }) => granges_adjust(bedfile, seqlens, *both, output.as_ref()), + #[cfg(feature = "dev-commands")] + Some(Commands::RandomBed { + seqlens, + num, + output, + }) => granges_random_bed(seqlens, *num, output.as_ref()), None => { println!("{}\n", INFO); std::process::exit(1); } - } + }; + let _output = result?; + // TODO do something with reporting, etc. + Ok(()) } fn main() { diff --git a/src/main/reporting.rs b/src/main/reporting.rs new file mode 100644 index 0000000..3ac2c30 --- /dev/null +++ b/src/main/reporting.rs @@ -0,0 +1,28 @@ +//! Centralized reporting to the user about e.g. fragile operations. +//! + +pub struct CommandOutput { + value: U, + report: Report, +} + +impl CommandOutput { + pub fn new(value: U, report: Report) -> Self { + Self { value, report } + } +} + +#[derive(Default)] +pub struct Report { + entries: Vec, +} + +impl Report { + pub fn new() -> Self { + Self::default() + } + + pub fn add_issue(&mut self, message: String) { + self.entries.push(message) + } +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 78aa8ad..78f2cf4 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,6 +1,6 @@ use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; -use crate::{error::GRangesError, traits::{RangeContainer}, traits::RangesIterable, Position}; +use crate::{error::GRangesError, traits::RangeContainer, traits::RangesIterable, Position}; use super::{validate_range, vec::VecRanges, RangeEmpty, RangeIndexed}; @@ -45,8 +45,8 @@ pub struct COITrees { impl std::fmt::Debug for COITrees { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("COITrees") - .field("number of ranges:", &self.ranges.len()) - .field("length", &self.length) + .field("number of ranges:", &self.ranges.len()) + .field("length", &self.length) .finish() } } @@ -60,15 +60,15 @@ impl COITrees { /// Query this range container for a particular range, and call a visit function on all /// overlapping ranges. pub fn query(&self, start: Position, end: Position, visit: F) - where + where F: FnMut(&IntervalNode), - { - // Note the terminology change to match coitrees (and uses i32s) - let first = start.try_into().expect("could not covert"); - let end: i32 = end.try_into().expect("could not covert"); - // internally coitrees uses 0-indexed, right-inclusive "last" - self.ranges.query(first, end - 1, visit) - } + { + // Note the terminology change to match coitrees (and uses i32s) + let first = start.try_into().expect("could not covert"); + let end: i32 = end.try_into().expect("could not covert"); + // internally coitrees uses 0-indexed, right-inclusive "last" + self.ranges.query(first, end - 1, visit) + } /// Return the number of ranges in this [`COITreeRangeContainer`] container. pub fn len(&self) -> usize { @@ -139,7 +139,7 @@ impl From> for RangeIndexed { impl RangesIterable for COITrees { fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); - let converted_iter = iter.map(|interval| RangeIndexed::from(interval)); + let converted_iter = iter.map(RangeIndexed::from); Box::new(converted_iter) } } @@ -147,7 +147,7 @@ impl RangesIterable for COITrees { impl RangesIterable for COITrees<()> { fn iter_ranges(&self) -> Box + '_> { let iter = self.ranges.iter(); - let converted_iter = iter.map(|interval| RangeEmpty::from(interval)); + let converted_iter = iter.map(RangeEmpty::from); Box::new(converted_iter) } } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index d04742b..740e42c 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -2,11 +2,11 @@ //! //! -use crate::{error::GRangesError, Position}; +use crate::{error::GRangesError, traits::TsvSerialize, Position}; pub mod coitrees; -pub mod vec; pub mod operations; +pub mod vec; #[derive(Clone, Debug, Default, PartialEq)] pub struct RangeEmpty { @@ -52,7 +52,62 @@ pub struct RangeRecord { pub data: U, } -/// Represents a range entry, possible with indices to sequence name and data. +impl RangeRecord { + pub fn new(seqname: String, start: Position, end: Position, data: U) -> Self { + Self { + seqname, + start, + end, + data, + } + } +} + +impl TsvSerialize for RangeRecord<()> { + fn to_tsv(&self) -> String { + format!("{}\t{}\t{}", self.seqname, self.start, self.end) + } +} + +impl TsvSerialize for RangeRecord { + fn to_tsv(&self) -> String { + format!( + "{}\t{}\t{}\t{}", + self.seqname, + self.start, + self.end, + self.data.to_tsv() + ) + } +} + +/// Represents a range entry without data. +#[derive(Debug, Clone, PartialEq)] +pub struct RangeEmptyRecord { + pub seqname_index: usize, + pub start: Position, + pub end: Position, +} + +impl RangeEmptyRecord { + pub fn new(seqname_index: usize, start: Position, end: Position) -> Self { + Self { + seqname_index, + start, + end, + } + } + pub fn to_record(self, seqnames: &Vec) -> RangeRecord<()> { + RangeRecord { + seqname: seqnames[self.seqname_index].clone(), + start: self.start, + end: self.end, + data: (), + } + } +} + +/// Represents a range entry, with indices to sequence name and data. #[derive(Debug, Clone, PartialEq)] pub struct RangeIndexedRecord { pub seqname_index: usize, diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs index 1c7feb7..a32a501 100644 --- a/src/ranges/operations.rs +++ b/src/ranges/operations.rs @@ -2,13 +2,18 @@ //! //! - [`adjust()`]: Adjust range start and end positions. -use crate::{PositionOffset, Position}; +use crate::{Position, PositionOffset}; -use super::RangeIndexed; +use super::{RangeEmpty, RangeIndexed, RangeRecord}; /// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] /// and returning `None` if the range has zero width after adjustment. -pub fn adjust(range: RangeIndexed, start_delta: PositionOffset, end_delta: PositionOffset, length: Position) -> Option { +pub fn adjust_range_record( + range: RangeRecord, + start_delta: PositionOffset, + end_delta: PositionOffset, + length: Position, +) -> Option> { let start: PositionOffset = range.start.try_into().unwrap(); let end: PositionOffset = range.end.try_into().unwrap(); let length: PositionOffset = length.try_into().unwrap(); @@ -16,14 +21,78 @@ pub fn adjust(range: RangeIndexed, start_delta: PositionOffset, end_delta: Posit // ensure within [0, length] let new_start = (start + start_delta).max(0).min(length); // ensure new_end >= new_start and within [0, length] - let new_end = (end + end_delta).max(new_start).min(length); + let new_end = (end + end_delta).max(new_start).min(length); // check for zero-width range if new_end <= new_start { // return None if the range has zero width None } else { - Some(RangeIndexed::new(new_start.try_into().unwrap(), new_end.try_into().unwrap(), range.index)) + Some(RangeRecord::new( + range.seqname, + new_start.try_into().unwrap(), + new_end.try_into().unwrap(), + range.data, + )) + } +} + +/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] +/// and returning `None` if the range has zero width after adjustment. +pub fn adjust_empty( + range: RangeEmpty, + start_delta: PositionOffset, + end_delta: PositionOffset, + length: Position, +) -> Option { + let start: PositionOffset = range.start.try_into().unwrap(); + let end: PositionOffset = range.end.try_into().unwrap(); + let length: PositionOffset = length.try_into().unwrap(); + + // ensure within [0, length] + let new_start = (start + start_delta).max(0).min(length); + // ensure new_end >= new_start and within [0, length] + let new_end = (end + end_delta).max(new_start).min(length); + + // check for zero-width range + if new_end <= new_start { + // return None if the range has zero width + None + } else { + Some(RangeEmpty::new( + new_start.try_into().unwrap(), + new_end.try_into().unwrap(), + )) + } +} + +/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] +/// and returning `None` if the range has zero width after adjustment. +pub fn adjust( + range: RangeIndexed, + start_delta: PositionOffset, + end_delta: PositionOffset, + length: Position, +) -> Option { + let start: PositionOffset = range.start.try_into().unwrap(); + let end: PositionOffset = range.end.try_into().unwrap(); + let length: PositionOffset = length.try_into().unwrap(); + + // ensure within [0, length] + let new_start = (start + start_delta).max(0).min(length); + // ensure new_end >= new_start and within [0, length] + let new_end = (end + end_delta).max(new_start).min(length); + + // check for zero-width range + if new_end <= new_start { + // return None if the range has zero width + None + } else { + Some(RangeIndexed::new( + new_start.try_into().unwrap(), + new_end.try_into().unwrap(), + range.index, + )) } } @@ -44,11 +113,10 @@ mod tests { let adjusted = adjust(range, -5, 20, 15).unwrap(); assert_eq!(adjusted, RangeIndexed::new(5, 15, 2)); } - + #[test] fn test_zero_width_result() { let range = RangeIndexed::new(5, 10, 3); assert!(adjust(range, 5, -5, 15).is_none()); } } - diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index 0f0dea2..2cd5ae6 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -62,3 +62,19 @@ impl RangesIterable for VecRanges { Box::new(converted_iter) } } + +impl RangesIntoIterable for VecRanges { + fn into_iter_ranges(self) -> Box> { + let iter = self.ranges.into_iter(); + Box::new(iter) + } +} + +impl RangesIterable for VecRanges { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + // NOTE: RangeIndexed is copyable but there still is overhead here + let converted_iter = iter.map(|interval| interval.to_owned()); + Box::new(converted_iter) + } +} diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 4b330c0..f1f0d63 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -3,11 +3,17 @@ use crate::{ create_granges_with_seqlens, + error::GRangesError, prelude::{GRanges, VecRangesIndexed}, - ranges::{coitrees::COITrees, vec::VecRanges, RangeEmpty}, + ranges::{ + coitrees::COITrees, + vec::{VecRanges, VecRangesEmpty}, + RangeEmpty, + }, Position, }; -use rand::{thread_rng, Rng}; +use indexmap::IndexMap; +use rand::{seq::SliceRandom, thread_rng, Rng}; // Stochastic test ranges defaults // @@ -59,6 +65,28 @@ pub fn random_vecranges(n: usize) -> VecRanges { vr } +/// Build a random [`GRanges, + num: usize, +) -> Result, GRangesError> { + let mut rng = thread_rng(); + + let mut gr: GRanges = GRanges::new_vec(seqlens); + + let seqnames: Vec = seqlens.keys().cloned().collect(); + for _ in 0..num { + let seqname = seqnames.choose(&mut rng).unwrap(); + let chrom_len = *seqlens + .get(seqname) + .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; + let (start, end) = random_range(chrom_len); + gr.push_range_empty(&seqname, start, end)?; + } + Ok(gr) +} + /// Build random [`COITrees`] from a random [`VecRanges`]. pub fn random_coitrees() -> COITrees<()> { let vr = random_vecranges(100); diff --git a/src/traits.rs b/src/traits.rs index b7999ee..1461c6f 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -3,9 +3,8 @@ use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord}; - /// Defines functionality common to all range containers, e.g. [`VecRanges`] and -/// [`COITrees`]. +/// [`COITrees`]. pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool { @@ -82,3 +81,24 @@ pub trait IndexedDataContainer<'a> { /// Create a new data container using a set of range indices. fn new_from_indices(&self, indices: &[usize]) -> Self::Output; } + +/// Defines how to serialize something to TSV. +pub trait TsvSerialize { + // Serialize something to a TSV [`String`]. + fn to_tsv(&self) -> String; +} + +impl TsvSerialize for String { + fn to_tsv(&self) -> String { + self.to_string() + } +} + +impl TsvSerialize for Vec { + fn to_tsv(&self) -> String { + self.iter() + .map(|x| x.to_tsv()) + .collect::>() + .join("\t") + } +} diff --git a/tests_data/hg38_seqlens.tsv b/tests_data/hg38_seqlens.tsv new file mode 100644 index 0000000..c83e74d --- /dev/null +++ b/tests_data/hg38_seqlens.tsv @@ -0,0 +1,22 @@ +chr1 248956422 +chr2 242193529 +chr3 198295559 +chr4 190214555 +chr5 181538259 +chr6 170805979 +chr7 159345973 +chr8 145138636 +chr9 138394717 +chr10 133797422 +chr11 135086622 +chr12 133275309 +chr13 114364328 +chr14 107043718 +chr15 101991189 +chr16 90338345 +chr17 83257441 +chr18 80373285 +chr19 58617616 +chr20 64444167 +chr21 46709983 +chr22 50818468