diff --git a/Cargo.toml b/Cargo.toml index a4cd622..0e635ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,12 +2,32 @@ name = "granges" version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +license = "MIT" +authors = ["Vince Buffalo "] +keywords = ["genomics", "bioinformatics", "compbio"] +categories = ["science"] +documentation = "https://docs.rs/granges/" +repository = "https://github.com/vsbuffalo/granges" +description = "A Rust library and command line tool for genomic range operations." [dependencies] +# clap = { version = "4.4.18", features = ["derive"], optional = true } +clap = { version = "4.4.18", features = ["derive"] } coitrees = { version = "0.4.0", features = ["nosimd"] } +flate2 = "1.0.28" genomap = "0.2.6" indexmap = "2.2.3" +ndarray = "0.15.6" +noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" thiserror = "1.0.57" + +# [features] +# cli = [ "clap" ] + +[[bin]] +name = "granges" +path = "src/main/mod.rs" +# required-features = ["cli"] + + diff --git a/src/data/mod.rs b/src/data/mod.rs new file mode 100644 index 0000000..a2966e8 --- /dev/null +++ b/src/data/mod.rs @@ -0,0 +1,3 @@ +//! Data container implementations. + +pub mod vec; diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs new file mode 100644 index 0000000..2885f34 --- /dev/null +++ b/src/data/ndarray.rs @@ -0,0 +1,62 @@ +//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`]. + +use ndarray::{Array1, Array2, ArrayView1}; +use crate::traits::IndexedDataContainer; + +impl<'a, U> IndexedDataContainer<'a> for Array1 +where + U: Copy + Default + 'a, +{ + type Item = U; + type Output = Array1; + + fn get_value(&'a self, index: usize) -> Self::Item { + self[index] + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) + } +} + +impl<'a, U> IndexedDataContainer<'a> for Array2 +where + U: Copy + Default + 'a, +{ + type Item = ArrayView1<'a, U>; + type Output = Array2; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.row(index) + } + + fn len(&self) -> usize { + self.shape()[0] + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + let cols = self.shape()[1]; + + let rows_data: Vec = indices + .iter() + .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) + .collect(); + + // create a new Array2 from the rows + // shape is (number of indices, number of columns) + Array2::from_shape_vec((indices.len(), cols), rows_data) + .expect("Shape and collected data size mismatch") + } +} diff --git a/src/data/vec.rs b/src/data/vec.rs new file mode 100644 index 0000000..a794fe4 --- /dev/null +++ b/src/data/vec.rs @@ -0,0 +1,33 @@ +//! Data container implementations for [`Vec`]. + + +/// Trait methods for the commonly-used `Vec` data container. +/// +/// Note that the associated `Item` type is always a *reference* to the data elements. 
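The `Array2` implementation above gathers the selected rows into a flat `Vec` and rebuilds a two-dimensional array from it. Here is a minimal, self-contained sketch of that same row-gathering step using only `ndarray` (the matrix values and indices are made up for illustration):

```rust
use ndarray::Array2;

fn main() {
    // A 4x2 matrix of per-range values (made-up numbers, for illustration).
    let data = Array2::from_shape_vec(
        (4, 2),
        vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    )
    .unwrap();

    // Indices of the rows (ranges) we want to keep.
    let indices = [0, 2, 3];

    // Same approach as new_from_indices() above: flatten the selected rows,
    // then rebuild an Array2 with shape (number of indices, number of columns).
    let cols = data.shape()[1];
    let rows_data: Vec<f64> = indices
        .iter()
        .flat_map(|&idx| data.row(idx).iter().cloned().collect::<Vec<f64>>())
        .collect();
    let subset = Array2::from_shape_vec((indices.len(), cols), rows_data)
        .expect("shape and collected data size mismatch");

    assert_eq!(subset.shape(), &[3, 2]);
    assert_eq!(subset.row(1).to_vec(), vec![0.5, 0.6]);
}
```

The same row subsetting could likely also be done with `ndarray`'s `select(Axis(0), &indices)`; the explicit flatten-and-rebuild version above just mirrors the implementation in the diff.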
+impl<'a, U> IndexedDataContainer<'a> for Vec<U> +where + U: Clone + 'a, +{ + type Item = &'a U; + type Output = Vec<U>; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.get(index).unwrap() + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + self.get(index).is_some() + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) + } +} + + + + diff --git a/src/error.rs b/src/error.rs index f66c007..371da82 100644 --- a/src/error.rs +++ b/src/error.rs @@ -5,15 +5,23 @@ use crate::Position; #[derive(Debug, Error)] pub enum GRangesError { + // IO related errors + #[error("File reading error: {0}")] + IOError(#[from] std::io::Error), + + // File parsing related errors + #[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")] + BedlikeTooFewColumns(String), + #[error("File has invalid column type entry: {0}")] + InvalidColumnType(String), + + // Invalid genomic range errors #[error("Range invalid: start ({0}) must be greater than end ({1})")] InvalidGenomicRange(Position, Position), - #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] InvalidGenomicRangeForSequence(Position, Position, Position), - #[error("Sequence name '{0}' is not the ranges container")] MissingSequence(String), - #[error("Error encountered in genomap::GenomeMap")] GenomeMapError(#[from] GenomeMapError), } diff --git a/src/granges.rs b/src/granges.rs index f57b655..7ff4f04 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -2,13 +2,14 @@ use genomap::GenomeMap; use indexmap::IndexMap; use crate::{ + io::RangeRecord, prelude::GRangesError, ranges::{ coitrees::{COITrees, COITreesIndexed}, vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, RangeEmpty, RangeIndexed, }, - traits::RangeContainer, + traits::{RangeContainer, RangesIterable, IndexedDataContainer}, Position, }; @@ -31,6 +32,10 @@ where pub fn is_empty(&self) -> bool { self.len() == 0 } + + pub fn get_ranges(&self, seqname: &str) -> Option<&C> { + self.ranges.get(seqname) + } } impl GRanges> { @@ -105,6 +110,23 @@ impl GRanges { } } +impl GRanges> { + pub fn from_iter( + iter: I, + seqlens: IndexMap, + ) -> Result>, GRangesError> + where + I: Iterator, GRangesError>>, + { + let mut gr = GRanges::new_vec(&seqlens); + for possible_entry in iter { + let entry = possible_entry?; + gr.push_range_with_data(&entry.seqname, entry.first, entry.last, entry.data)?; + } + Ok(gr) + } +} + + impl GRanges { /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library @@ -123,6 +145,7 @@ impl GRanges { } } + #[cfg(test)] mod tests { use indexmap::indexmap; diff --git a/src/io/io.rs b/src/io/io.rs new file mode 100644 index 0000000..1fd5d42 --- /dev/null +++ b/src/io/io.rs @@ -0,0 +1,170 @@ +//! Input/Output file handling with [`InputFile`] and [`OutputFile`]. +//! +//! These types abstract over reading/writing both plaintext and gzip-compressed +//! input/output.
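Stepping back to the `GRanges::from_iter()` constructor added above: it simply drains a fallible iterator of parsed records, propagating the first error it hits. A self-contained sketch of that pattern, with hypothetical `Record` and `Store` types standing in for the real `RangeRecord` and `GRanges` (this is not the crate's actual API, just the build-loop shape):

```rust
// Hypothetical, simplified stand-ins for RangeRecord and GRanges, to
// illustrate the from_iter() pattern above: walk an iterator of fallible
// records, propagate the first error with `?`, and push the rest.
#[derive(Debug)]
struct Record {
    seqname: String,
    first: u32,
    last: u32,
    data: f64,
}

#[derive(Debug, Default)]
struct Store {
    records: Vec<Record>,
}

impl Store {
    fn from_iter<I>(iter: I) -> Result<Self, String>
    where
        I: Iterator<Item = Result<Record, String>>,
    {
        let mut store = Store::default();
        for possible_entry in iter {
            // any parse or IO error stops construction immediately
            let entry = possible_entry?;
            store.records.push(entry);
        }
        Ok(store)
    }
}

fn main() {
    let parsed = vec![
        Ok(Record { seqname: "chr1".to_string(), first: 0, last: 5, data: 1.0 }),
        Ok(Record { seqname: "chr1".to_string(), first: 4, last: 7, data: 2.0 }),
    ];
    let store = Store::from_iter(parsed.into_iter()).unwrap();
    assert_eq!(store.records.len(), 2);
}
```

The design choice here is that any parsing error aborts construction of the whole container, rather than silently dropping bad lines.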
+ +use flate2::read::GzDecoder; +use flate2::write::GzEncoder; +use flate2::Compression; +use std::fs::File; +use std::io::Write; +use std::io::{self, BufWriter}; +use std::io::{BufRead, BufReader, Read}; + +/// Check if a file is a gzipped by looking for the magic numbers +fn is_gzipped_file(file_path: &str) -> io::Result { + let mut file = File::open(file_path)?; + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + + Ok(buffer == [0x1f, 0x8b]) +} + +/// Represents an input file. +/// +/// This struct is used to handle operations on an input file, such as reading from the file. +/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input +/// to be read through a common interface. +pub struct InputFile { + pub filepath: String, + pub comments: Option>, + pub header: Option, + pub skip_lines: usize, +} + +impl InputFile { + /// Constructs a new `InputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `InputFile` will automatically uncompress the input. + pub fn new(filepath: &str) -> Self { + Self { + filepath: filepath.to_string(), + comments: None, + header: None, + skip_lines: 0, + } + } + + /// Opens the file and returns a buffered reader. + /// + /// If the file is gzip-compressed (indicated by a ".gz" extension), this method will + /// automatically handle the decompression. + /// + /// # Returns + /// + /// A result containing a `BufReader>` on success, or a `FileError` on failure. + /// + pub fn reader(&self) -> io::Result>> { + let file = File::open(self.filepath.clone())?; + //let is_gzipped_name = self.filepath.ends_with(".gz"); + let is_gzipped = is_gzipped_file(&self.filepath)?; + let reader: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + Ok(BufReader::new(reader)) + } + + /// Collects comment lines and/or a line at the start of the file. + pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result { + let mut buf_reader = self.reader()?; + let mut comments = Vec::new(); + let mut line = String::new(); + + while buf_reader.read_line(&mut line)? > 0 { + if line.starts_with(comment) { + comments.push(line.trim_end().to_string()); + self.skip_lines += 1; + } else if let Some(header_string) = header { + if line.starts_with(header_string) { + self.header = Some(line.trim_end().to_string()); + self.skip_lines += 1; + // We only handle one header line. If there are more, the + // file is *very* poorly formatted. So just let downstream + // parsing errors catch this. In the future, we could have a specialized + // error. + break; + } + // break on the first non-header/comment line + break; + } + line.clear(); + } + + self.comments = Some(comments); + Ok(self.skip_lines > 0) + } + + /// Method to continue reading after skipping the comment and header lines. + pub fn continue_reading(&self) -> io::Result>> { + let mut buf_reader = self.reader()?; + let mut skipped_lines = 0; + let mut line = String::new(); + + // skip the lines that were previously read as comments or header + while skipped_lines < self.skip_lines { + buf_reader.read_line(&mut line)?; + skipped_lines += 1; + line.clear(); + } + Ok(buf_reader) + } +} + +/// Represents an output file. +/// +/// This struct is used to handle operations on an output file, such as writing to the file. +/// This abstracts writing both plaintext and gzip-compressed files. 
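Before the matching `OutputFile` type, here is a rough usage sketch of the `InputFile` methods defined above, assuming the re-exports in `src/io/mod.rs` later in this change set: construct, collect any leading `#` comment lines, then resume reading past them. The path points at the example BED file added at the end of this diff; error handling is kept minimal.

```rust
use std::io::BufRead;

use granges::io::InputFile;

fn main() -> std::io::Result<()> {
    // Works the same for plaintext and gzip-compressed files.
    let mut input = InputFile::new("tests_data/example.bed");

    // Record (and skip past) any leading '#' comment lines; no header pattern here.
    input.collect_metadata("#", None)?;

    // Get a reader positioned after the skipped lines.
    let reader = input.continue_reading()?;
    for line in reader.lines() {
        println!("{}", line?);
    }
    Ok(())
}
```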
+pub struct OutputFile { + pub filepath: String, + pub header: Option>, +} + +impl OutputFile { + /// Constructs a new `OutputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `OutputFile` will automatically write gzip-compressed output. + /// * `header` - An optional vector of strings representing commented header lines to be written to the file. + pub fn new(filepath: &str, header: Option>) -> Self { + Self { + filepath: filepath.to_string(), + header, + } + } + + /// Opens the file and returns a writer. + /// + /// If the file path ends with ".gz", the file is treated as gzip-compressed, and the + /// function will handle compression automatically. If a header is set, it will be written + /// to the file. + /// + /// # Returns + /// + /// A result containing a `Box` on success, or an `io::Error` on failure. + pub fn writer(&self) -> io::Result> { + let outfile = &self.filepath; + let is_gzip = outfile.ends_with(".gz"); + let mut writer: Box = if is_gzip { + Box::new(BufWriter::new(GzEncoder::new( + File::create(outfile)?, + Compression::default(), + ))) + } else { + Box::new(BufWriter::new(File::create(outfile)?)) + }; + // write header if one is set + if let Some(entries) = &self.header { + for entry in entries { + writeln!(writer, "#{}", entry)?; + } + } + Ok(writer) + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs new file mode 100644 index 0000000..9542972 --- /dev/null +++ b/src/io/mod.rs @@ -0,0 +1,9 @@ +//! Input/Output +//! + +pub mod io; +pub mod noodles; +pub mod parsers; + +pub use io::{InputFile, OutputFile}; +pub use parsers::{Bed3RecordIterator, RangeRecord, TsvRecordIterator}; diff --git a/src/io/noodles.rs b/src/io/noodles.rs new file mode 100644 index 0000000..242e308 --- /dev/null +++ b/src/io/noodles.rs @@ -0,0 +1,12 @@ +//! + +use crate::Position; +use noodles::core::Position as NoodlesPosition; + +/// Convert from [`noodles::core::Position`], which is [1-based +/// indexing](https://docs.rs/noodles-core/latest/noodles_core/position/struct.Position.html) +/// to GRanges internal 0-based indexing. +pub fn convert_noodles_position(value: NoodlesPosition) -> Position { + let one_based_position: Position = value.get().try_into().unwrap(); + one_based_position - 1 +} diff --git a/src/io/parsers.rs b/src/io/parsers.rs new file mode 100644 index 0000000..e7d2421 --- /dev/null +++ b/src/io/parsers.rs @@ -0,0 +1,268 @@ +//! Functionality for general parsing, by turning BED-like files into iterators. +//! 
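And for the write side, a usage sketch of `OutputFile` as defined above; the output filename and header text are illustrative, and the `.gz` suffix is what triggers compressed output:

```rust
use std::io::Write;

use granges::io::OutputFile;

fn main() -> std::io::Result<()> {
    // Header entries are written as '#'-prefixed lines before any data.
    let output = OutputFile::new(
        "adjusted_ranges.bed.gz",
        Some(vec!["made with granges".to_string()]),
    );
    let mut writer = output.writer()?;
    writeln!(writer, "chr1\t10\t20")?;
    writeln!(writer, "chr2\t4\t5")?;
    Ok(())
}
```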
+ +use noodles::bed::{self}; +use std::collections::HashSet; +use std::io::{BufRead, BufReader, Read}; + +use crate::error::GRangesError; +use crate::io::io::InputFile; +use crate::ranges::RangeRecord; +use crate::traits::GeneralRangeRecordIterator; + +use super::noodles::convert_noodles_position; + +pub struct Bed3RecordIterator { + bed_reader: bed::Reader>>, +} + +impl Bed3RecordIterator { + pub fn new(filepath: &str) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + let bed_reader = bed::Reader::new(reader); + Ok(Self { bed_reader }) + } + + pub fn from_reader(reader: BufReader>) -> Self { + let bed_reader = bed::Reader::new(reader); + Self { bed_reader } + } +} + +impl Iterator for Bed3RecordIterator { + type Item = Result>, GRangesError>; + + fn next(&mut self) -> Option { + self.bed_reader.records::<3>().next().map(|res| { + res.map_err(GRangesError::IOError) + .map(|record| RangeRecord { + seqname: record.reference_sequence_name().to_string(), + first: convert_noodles_position(record.start_position()), + last: convert_noodles_position(record.end_position()), + data: None, + }) + }) + } +} + +impl GeneralRangeRecordIterator> for Bed3RecordIterator { + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals> { + FilteredIntervals::new(self, Some(seqnames), None) + } + + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals> { + FilteredIntervals::new(self, None, Some(seqnames)) + } +} + +pub struct TsvRecordIterator { + reader: BufReader>, + parser: F, + phantom: std::marker::PhantomData, +} + +impl TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + pub fn new(filepath: &str, parser: F) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + + Ok(Self { + reader, + parser, + phantom: std::marker::PhantomData, + }) + } +} + +impl Iterator for TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => None, + Ok(_) => { + let line = line.trim_end(); + Some((self.parser)(line)) + } + Err(e) => Some(Err(GRangesError::IOError(e))), + } + } +} + +/// An iterator over [`IntervalRecord`] items that filters based on sequence name. +/// +/// Note that that the exclude filter is prioritized over the retain filter. So, +/// if a pipeline contains both, then if a chromosome is supplied to +/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, +/// it is skipped. This is to prevent validation and returning a [`Result`]. +/// +/// # Example +/// +/// ``` +/// use granges::prelude::*; +/// use noodles::bed; +/// +/// let iter = Bed3RecordIterator::new("tests_data/example.bed") +/// .expect("error reading file") +/// .exclude_seqnames(vec!["chr1".to_string()]); +/// +/// let seqlens = seqlens! 
{ "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; +/// let gr = GRanges::from_iter(iter, seqlens) +/// .expect("parsing error"); +/// let mut iter = gr.iter_ranges(); +/// +/// // the first range should be the third range in the file, +/// // chr2:4-5 +/// assert_eq!(iter.next().unwrap().first, 4); +/// assert_eq!(iter.next().unwrap().last, 5); +/// ``` +pub struct FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, +} + +impl FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + pub fn new( + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, + ) -> Self { + let retain_seqnames = retain_seqnames.map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); + Self { + inner, + retain_seqnames, + exclude_seqnames, + } + } +} + +impl Iterator for FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + type Item = Result, GRangesError>; + + /// Get the next filtered entry, prioritizing exclude over retain. + fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +/// Parses a single column from a string slice into a specified type. +/// +/// This function is particularly useful for converting columns in genomic data files +/// from string representation to a specific type like `usize`, `f64`, etc. +/// +/// # Type Parameters +/// +/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. +/// +/// # Arguments +/// +/// * `column`: The string slice representing the column to be parsed. +/// * `line`: The entire line from which the column was extracted. Used for error reporting. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. +pub fn parse_column(column: &str, line: &str) -> Result +where + ::Err: std::fmt::Debug, +{ + column + .parse::() + .map_err(|_| GRangesError::InvalidColumnType(format!("column '{}' in '{}'", column, line))) +} + +/// Parses a BED-like format line into its constituent components. +/// +/// BED-like formats contain BED3 columns (chromosome, start, and end) and +/// others. It's an non-standard but extensible (since it's just a TSV), and sensible +/// for a variety of genomic data. +/// +/// # Arguments +/// +/// * `line`: A string slice representing a line in BED3 format. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns +/// or if any column cannot be correctly parsed. 
+pub fn parse_bedlike(line: &str) -> Result<(String, i32, i32, Vec<&str>), GRangesError> { + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + } + + let chrom = parse_column(columns[0], line)?; + let start: i32 = parse_column(columns[1], line)?; + let end: i32 = parse_column(columns[2], line)?; + + // Collect remaining columns + let additional_columns = columns[3..].to_vec(); + + Ok((chrom, start, end - 1, additional_columns)) +} + +#[cfg(test)] +mod tests { + use crate::{granges::GRanges, io::Bed3RecordIterator, seqlens}; + + #[test] + fn test_parser() { + // based on this example: https://docs.rs/noodles-bed/latest/noodles_bed/struct.Reader.html#method.records + let iter = Bed3RecordIterator::new("tests_data/noodles_example.bed").unwrap(); + + let seqlens = seqlens! { "sq0" => 10 }; + + let gr = GRanges::from_iter(iter, seqlens).unwrap(); + + assert_eq!(gr.len(), 2); + + // let mut gr_iter = gr.iter_ranges(); + // let first_interval = gr_iter.next().unwrap(); + // assert_eq!(first_interval.first, 7); + // assert_eq!(first_interval.last, 12); + // + // let second_interval = gr_iter.next().unwrap(); + // assert_eq!(second_interval.first, 20); + // assert_eq!(second_interval.last, 33); + } +} diff --git a/src/iterators.rs b/src/iterators.rs index 58507fe..eb4ddb1 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,11 +1,51 @@ -/// the [`rangesiterable`] trait defines common functionality for iterating over -/// the copyable range types. -pub trait RangesIterable { - fn iter_ranges(&self) -> Box + '_>; -} +use std::marker::PhantomData; + +use genomap::GenomeMap; + +use crate::{ranges::RangeRefRecord, traits::RangeContainer}; -pub trait RangesIntoIterable { - fn into_iter_ranges(self) -> Box>; + +pub struct GenomicRangeIterator<'a, R, U> { + ranges: &'a GenomeMap, + seqnames: Vec, + current_seqname_index: usize, + current_range_index: usize, + _marker: PhantomData<&'a U>, } +impl<'a, R, U> Iterator for GenomicRangeIterator<'a, R, U> +where U: 'a, + R: RangeContainer { + type Item = RangeRefRecord<'a, U>; + + fn next(&mut self) -> Option { + // If there are no more seqnames left, return None + if self.current_seqname_index >= self.seqnames.len() { + return None; + } + + let seqname = &self.seqnames[self.current_seqname_index]; + let ranges = self.ranges.get(seqname)?; + + // If the current range index is beyond the current seqname's ranges, move to the next seqname + if self.current_range_index >= ranges.len() { + self.current_seqname_index += 1; + self.current_range_index = 0; + // Recursively call next() to handle the next seqname + return self.next(); + } + + let range = &ranges[self.current_range_index]; + let data = self.genomic_ranges.data.get(range.index)?; + + self.current_range_index += 1; + + Some(RangeRecord { + seqname: seqname.clone(), + first: range.start, + last: range.end, + data, + }) + } +} diff --git a/src/lib.rs b/src/lib.rs index 60177cb..d8f6e08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,11 @@ #![crate_name = "granges"] #![doc(html_root_url = "https://docs.rs/granges/")] +pub use indexmap; + pub mod error; pub mod granges; +pub mod io; pub mod iterators; pub mod join; pub mod ranges; @@ -15,12 +18,23 @@ pub type Position = u32; pub mod prelude { pub use crate::error::GRangesError; pub use crate::granges::GRanges; - pub use crate::iterators::RangesIterable; + pub use crate::io::{Bed3RecordIterator, RangeRecord, TsvRecordIterator}; pub use 
crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; + pub use crate::traits::{GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable}; + + pub use crate::seqlens; +} + +/// Create an [`indexmap::IndexMap`] of sequence names and their lengths. +#[macro_export] +macro_rules! seqlens { + ($($key:expr => $value:expr),* $(,)?) => { + $crate::indexmap::indexmap!($($key.to_string() => $value),*) + }; } -/// Create a new `GRanges` with sequence length information (used primarily for small examples) +/// Create a new `GRanges` with sequence length information (used primarily for small examples) #[macro_export] macro_rules! create_granges_with_seqlens { ($range_type:ty, $data_type:ty, { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),*]),* }, seqlens: { $($chr_len:expr => $len:expr),* }) => { diff --git a/src/main/commands.rs b/src/main/commands.rs new file mode 100644 index 0000000..6d7d34c --- /dev/null +++ b/src/main/commands.rs @@ -0,0 +1,4 @@ +use std::path::PathBuf; + +/// Adjust the genomic ranges in a bedfile by some specified amount. +pub fn granges_adjust(bedfile: &PathBuf, seqlens: &PathBuf, both: &i32, output: Option<&PathBuf>) {} diff --git a/src/main/mod.rs b/src/main/mod.rs new file mode 100644 index 0000000..3ec7cb6 --- /dev/null +++ b/src/main/mod.rs @@ -0,0 +1,71 @@ +use std::path::PathBuf; + +use clap::{Parser, Subcommand}; +use granges::prelude::GRangesError; + +pub mod commands; +use crate::commands::granges_adjust; + +const INFO: &str = "\ +granges: genomic range operations built off of the GRanges library +usage: granges [--help] + +Subcommands: + + adjust: adjust each genomic range, e.g. to add a kilobase to each end. + +"; + +#[derive(Parser)] +#[clap(name = "granges")] +#[clap(about = INFO)] struct Cli { + #[arg(short, long, action = clap::ArgAction::Count)] + debug: u8, + + #[command(subcommand)] + command: Option<Commands>, +} + +#[derive(Subcommand)] +enum Commands { + Adjust { + /// a TSV genome file of chromosome names and their lengths + #[arg(long, required = true)] + seqlens: PathBuf, + /// an input BED-like TSV file + #[arg(required = true)] + bedfile: PathBuf, + /// number of basepairs to expand the range start and end positions by + #[arg(long)] + both: i32, + /// an optional output file (standard output will be used if not specified) + output: Option<PathBuf>, + }, +} + +fn run() -> Result<(), GRangesError> { + let cli = Cli::parse(); + match &cli.command { + Some(Commands::Adjust { + bedfile, + seqlens, + both, + output, + }) => Ok(granges_adjust(bedfile, seqlens, both, output.as_ref())), + None => { + println!("{}\n", INFO); + std::process::exit(1); + } + } +} + +fn main() { + match run() { + Ok(_) => {} + Err(e) => { + eprintln!("Error: {}", e); + std::process::exit(1); + } + } +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index c9311f4..cc18309 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,6 +1,6 @@ use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; -use crate::{error::GRangesError, iterators::{RangesIterable, RangesIntoIterable}, traits::RangeContainer, Position}; +use crate::{error::GRangesError, traits::RangeContainer, traits::RangesIterable, Position}; use super::{validate_range, vec::VecRanges, RangeEmpty, RangeIndexed}; @@ -122,11 +122,10 @@ impl From> for RangeIndexed { } -/// /// # Developer Notes /// /// Internally, the [`coitrees`] iterator is over their interval type.
-/// Their iterator does not consume, but we need an owned (or copyable, +/// Their iterator does not consume, but we need an owned (or copyable, /// as is case here) type to map out. Thus, we need this odd variant of /// an iterator that doesn't return references and does not consume. impl RangesIterable for COITrees { @@ -148,10 +147,16 @@ impl RangesIterable for COITrees<()> { #[cfg(test)] mod tests { use crate::prelude::*; + use crate::ranges::RangeIndexed; use crate::test_utilities::{granges_test_case_01, random_coitrees}; #[test] fn test_ranges_iterable_coitrees() { - let gr = granges_test_case_01(); + let gr = granges_test_case_01().to_coitrees().unwrap(); + let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges(); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(4, 7, 1)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(10, 17, 2)); + assert!(chr1_iter.next().is_none()); } } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index f527f40..185b7e5 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -7,7 +7,7 @@ use crate::{error::GRangesError, Position}; pub mod coitrees; pub mod vec; -#[derive(Clone, Default)] +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeEmpty { pub start: Position, pub end: Position, @@ -23,7 +23,7 @@ impl RangeEmpty { } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeIndexed { pub start: Position, pub end: Position, @@ -40,6 +40,25 @@ impl RangeIndexed { } } +/// Represents a parsed range entry, possibly containing some data. +#[derive(Debug, Clone)] +pub struct RangeRecord<U> { + pub seqname: String, + pub first: Position, + pub last: Position, + pub data: U, +} + + +/// Represents a range entry, possibly with a reference to some data. +#[derive(Debug, Clone)] +pub struct RangeRefRecord<'a, U> { + pub seqname: &'a str, + pub first: Position, + pub last: Position, + pub data: &'a U, +} + /// Validates whether a given range is valid for accessing a sequence of a given `length`. /// /// # Arguments diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index e31b082..c0e74d8 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,6 +1,6 @@ -use crate::{error::GRangesError, traits::RangeContainer, Position}; -use crate::iterators::{RangesIterable, RangesIntoIterable}; use super::{validate_range, RangeEmpty, RangeIndexed}; +use crate::traits::{RangesIntoIterable, RangesIterable}; +use crate::{error::GRangesError, traits::RangeContainer, Position}; pub type VecRangesIndexed = VecRanges<RangeIndexed>; pub type VecRangesEmpty = VecRanges<RangeEmpty>; @@ -47,10 +47,19 @@ impl RangeContainer for VecRanges { } } - impl RangesIntoIterable for VecRanges { fn into_iter_ranges(self) -> Box> { let iter = self.ranges.into_iter(); Box::new(iter) } } + + +impl RangesIterable for VecRanges { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeIndexed::from(interval)); + Box::new(converted_iter) + } +} + diff --git a/src/traits.rs b/src/traits.rs index 53087bd..751bafa 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,6 +1,89 @@ +//! Traits used by the GRanges library. +//!
+ +use crate::{ + error::GRangesError, + io::parsers::{FilteredIntervals, RangeRecord}, + Position, +}; + pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool { self.len() == 0 } } + +pub trait GenericRange { + fn index(&self) -> Option; + fn start(&self) -> Position; + fn end(&self) -> Position; +} + +pub trait GeneralRangeRecordIterator: + Iterator, GRangesError>> + Sized +{ + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals; + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals; +} + +/// The [`RangesIterable`] trait defines common functionality for iterating over +/// the range types in range containers. +pub trait RangesIterable { + fn iter_ranges(&self) -> Box + '_>; +} + +/// The [`RangesIntoIterable`] trait defines common functionality for *consuming* iterating +/// over the range types in range containers. +pub trait RangesIntoIterable { + fn into_iter_ranges(self) -> Box>; +} + +/// The main [`IndexedDataContainer`] type is used to interact +/// with arbitrary *indexed* container of data of type `DataContainer::Item`. +/// +/// GRanges provides [`IndexedDataContainer`] trait implementations for: +/// +/// - [`ndarray::Array1`] +/// - [`ndarray::Array2`] +/// - [`Vec`] +/// - [`polars::DataFrame`] +/// +/// # Panics +/// This will panic if `DataContainer.get_value()` tries to access +/// an index that does not exist. This is an explicit design choice: +/// we could have returned an `Option` or a `Result`. However, +/// when the index is invalid, there is a serious problem in the creation +/// of the `GRanges` object's set of ranges; this should not ever +/// happen in normal operation. Returning `Option`/`Result` just to +/// protect against this edge case would force most `GRanges` +/// functions to return a `Result`. This would clog the API usage, so +/// we opt to just panic. +pub trait IndexedDataContainer<'a> { + // note: I don't think we can refactor out this lifetime, + // since it is needed often for associated types + type Item; + type Output; + // this type is needed to reference the core underlying type, + // eg to handle reference types + fn is_valid_index(&self, index: usize) -> bool; + fn get_value(&'a self, index: usize) -> ::Item; + fn len(&self) -> usize; + fn is_empty(&self) -> bool { + self.len() == 0 + } + fn invalid_indices(&self, indices: &[usize]) -> Vec { + let mut invalid = Vec::new(); + for &index in indices { + if !self.is_valid_index(index) { + invalid.push(index); + } + } + invalid + } + + /// Create a new data container using a set of range indices. + fn new_from_indices(&self, indices: &[usize]) -> Self::Output; +} + + diff --git a/tests_data/example.bed b/tests_data/example.bed new file mode 100644 index 0000000..0f40e10 --- /dev/null +++ b/tests_data/example.bed @@ -0,0 +1,5 @@ +chr1 10 20 +chr1 14 18 +chr2 4 5 +chr2 5 6 +chr4 9 15 diff --git a/tests_data/noodles_example.bed b/tests_data/noodles_example.bed new file mode 100644 index 0000000..eccc0b9 --- /dev/null +++ b/tests_data/noodles_example.bed @@ -0,0 +1,2 @@ +sq0 7 13 +sq0 20 34
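Finally, since `IndexedDataContainer` is the extension point for plugging new data container types into `GRanges`, here is a trimmed-down, self-contained restatement of the idea. Note that this simplified trait omits the lifetime and the separate `Output` associated type of the real trait above, and the `Scores` container is hypothetical:

```rust
/// A simplified restatement of the IndexedDataContainer idea, for illustration
/// only; the real trait above also carries a lifetime and an Output type.
trait SimpleIndexedData {
    type Item;
    fn get_value(&self, index: usize) -> Self::Item;
    fn len(&self) -> usize;
    fn is_valid_index(&self, index: usize) -> bool;
    fn new_from_indices(&self, indices: &[usize]) -> Self;
}

/// A hypothetical per-range score container.
struct Scores(Vec<f64>);

impl SimpleIndexedData for Scores {
    type Item = f64;
    fn get_value(&self, index: usize) -> f64 {
        // Panics on a bad index, mirroring the design choice described above.
        self.0[index]
    }
    fn len(&self) -> usize {
        self.0.len()
    }
    fn is_valid_index(&self, index: usize) -> bool {
        index < self.0.len()
    }
    fn new_from_indices(&self, indices: &[usize]) -> Self {
        Scores(indices.iter().map(|&i| self.get_value(i)).collect())
    }
}

fn main() {
    let scores = Scores(vec![0.1, 0.2, 0.3, 0.4]);
    let subset = scores.new_from_indices(&[0, 3]);
    assert_eq!(subset.0, vec![0.1, 0.4]);
    assert!(!scores.is_valid_index(10));
    println!("kept {} of {} scores", subset.len(), scores.len());
}
```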