temporary commit

vsbuffalo · Feb 15, 2024 · 2a644e8 · 2a644e8
1 parent 1b5e236
commit 2a644e8
Show file tree

Hide file tree

Showing 20 changed files with 884 additions and 24 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,12 +2,32 @@
 name = "granges"
 version = "0.1.0"
 edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+license = "MIT"
+authors = ["Vince Buffalo <[email protected]>"]
+keywords = ["genomics", "bioinformatics", "compbio"]
+categories = ["science"]
+documentation = "https://docs.rs/granges/"
+repository = "https://github.com/vsbuffalo/granges"
+description = "A Rust library and command line tool for genomic range operations."
 
 [dependencies]
+# clap = { version = "4.4.18", features = ["derive"], optional = true }
+clap = { version = "4.4.18", features = ["derive"] }
 coitrees = { version = "0.4.0", features = ["nosimd"] }
+flate2 = "1.0.28"
 genomap = "0.2.6"
 indexmap = "2.2.3"
+ndarray = "0.15.6"
+noodles = { version = "0.63.0", features = ["core", "bed"] }
 rand = "0.8.5"
 thiserror = "1.0.57"
+
+# [features]
+# cli = [ "clap" ]
+
+[[bin]]
+name = "granges"
+path = "src/main/mod.rs"
+# required-features = ["cli"]
+
+
diff --git a/src/data/mod.rs b/src/data/mod.rs
@@ -0,0 +1,3 @@
+//! Data container implementations.
+
+pub mod vec;
diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs
@@ -0,0 +1,62 @@
+//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`].
+
+use ndarray::{Array1, Array2, ArrayView1};
+use crate::traits::IndexedDataContainer;
+
+/// [`IndexedDataContainer`] implementation for one-dimensional [`Array1`] data.
+///
+/// Elements are `Copy`, so `Item` is the element value itself rather than a
+/// reference to it.
+impl<'a, U> IndexedDataContainer<'a> for Array1<U>
+where
+    U: Copy + Default + 'a,
+{
+    type Item = U;
+    type Output = Array1<U>;
+
+    /// Returns a copy of the element at `index` (indexing panics when out of bounds).
+    fn get_value(&'a self, index: usize) -> Self::Item {
+        self[index]
+    }
+
+    /// Number of elements in the array.
+    fn len(&self) -> usize {
+        // Inherent methods take precedence over trait methods, so this calls the
+        // array's own `len`, not this trait method recursively.
+        self.len()
+    }
+
+    /// Whether `index` addresses an existing element.
+    fn is_valid_index(&self, index: usize) -> bool {
+        // For a one-dimensional array the total length equals the size of axis 0.
+        index < self.len()
+    }
+
+    /// Builds a new array holding the elements at `indices`, in the order given.
+    fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+        indices.iter().map(|&position| self[position]).collect()
+    }
+}
+
+/// [`IndexedDataContainer`] implementation for two-dimensional [`Array2`] data.
+///
+/// Indexing is by row: `Item` is a view of one row and `len()` is the number of rows.
+impl<'a, U> IndexedDataContainer<'a> for Array2<U>
+where
+    U: Copy + Default + 'a,
+{
+    type Item = ArrayView1<'a, U>;
+    type Output = Array2<U>;
+
+    /// Returns a view of row `index`.
+    fn get_value(&'a self, index: usize) -> Self::Item {
+        self.row(index)
+    }
+
+    /// Number of rows (the size of axis 0).
+    fn len(&self) -> usize {
+        self.shape()[0]
+    }
+
+    /// Whether `index` is smaller than the number of rows.
+    fn is_valid_index(&self, index: usize) -> bool {
+        index < self.shape()[0]
+    }
+
+    /// Builds a new array from the rows at `indices`, in the order given.
+    ///
+    /// The result has shape `(indices.len(), number of columns)`.
+    fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+        let cols = self.shape()[1];
+
+        // Copy every selected row straight into one preallocated buffer instead of
+        // collecting each row into its own temporary `Vec` first.
+        let mut rows_data: Vec<U> = Vec::with_capacity(indices.len() * cols);
+        for &idx in indices {
+            rows_data.extend(self.row(idx).iter().copied());
+        }
+
+        // The buffer holds exactly `indices.len() * cols` elements in row order, so
+        // the requested shape always matches the data.
+        Array2::from_shape_vec((indices.len(), cols), rows_data)
+            .expect("Shape and collected data size mismatch")
+    }
+}
diff --git a/src/data/vec.rs b/src/data/vec.rs
@@ -0,0 +1,33 @@
+//! Data container implementations for [`Vec<U>`].
+
+use crate::traits::IndexedDataContainer;
+
+/// Implements [`IndexedDataContainer`] for the commonly-used `Vec<U>` data container.
+///
+/// The associated `Item` type is always a *reference* to a stored element; owned
+/// copies are only made by `new_from_indices`, which clones the selected elements.
+impl<'a, U> IndexedDataContainer<'a> for Vec<U>
+where
+    U: Clone + 'a,
+{
+    type Item = &'a U;
+    type Output = Vec<U>;
+
+    /// Returns a reference to the element at `index`; panics if there is none.
+    fn get_value(&'a self, index: usize) -> Self::Item {
+        self.get(index).unwrap()
+    }
+
+    /// Number of elements in the vector.
+    fn len(&self) -> usize {
+        // Inherent methods take precedence over trait methods, so this is `Vec::len`.
+        self.len()
+    }
+
+    /// Whether `index` addresses an existing element.
+    fn is_valid_index(&self, index: usize) -> bool {
+        index < self.len()
+    }
+
+    /// Builds a new vector by cloning the elements at `indices`, in the order given.
+    fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
+        indices
+            .iter()
+            .map(|&position| U::clone(self.get_value(position)))
+            .collect()
+    }
+}
+
+
+
+
diff --git a/src/error.rs b/src/error.rs
@@ -5,15 +5,23 @@ use crate::Position;
 
 #[derive(Debug, Error)]
 pub enum GRangesError {
+    // IO related errors
+    /// An underlying [`std::io::Error`]; `#[from]` lets `?` convert it automatically.
+    #[error("File reading error: {0}")]
+    IOError(#[from] std::io::Error),
+
+    // File parsing related errors
+    /// A BED-like line had fewer than the three required columns; carries the line.
+    #[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")]
+    BedlikeTooFewColumns(String),
+    /// A column entry could not be interpreted as the expected type; carries the entry.
+    #[error("File has invalid column type entry: {0}")]
+    InvalidColumnType(String),
+
+    // Invalid genomic range errors
+    // NOTE(review): the message below says start "must be greater than" end, which
+    // reads inverted for a (start, end) range -- confirm the intended wording.
     #[error("Range invalid: start ({0}) must be greater than end ({1})")]
     InvalidGenomicRange(Position, Position),
-
     #[error("Range [{0}, {1}] is invalid for sequence of length {2}")]
     InvalidGenomicRangeForSequence(Position, Position, Position),
-
     #[error("Sequence name '{0}' is not the ranges container")]
     MissingSequence(String),
-
     #[error("Error encountered in genomap::GenomeMap")]
     GenomeMapError(#[from] GenomeMapError),
 }
diff --git a/src/granges.rs b/src/granges.rs
@@ -2,13 +2,14 @@ use genomap::GenomeMap;
 use indexmap::IndexMap;
 
 use crate::{
+ io::RangeRecord,
  prelude::GRangesError,
  ranges::{
  coitrees::{COITrees, COITreesIndexed},
  vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
  RangeEmpty, RangeIndexed,
  },
- traits::RangeContainer,
+ traits::{RangeContainer, RangesIterable, IndexedDataContainer},
  Position,
 };
 
@@ -31,6 +32,10 @@ where
  pub fn is_empty(&self) -> bool {
  self.len() == 0
  }
+
+ /// Looks up the range container stored under the sequence name `seqname`.
+ ///
+ /// Returns whatever `self.ranges.get(seqname)` yields: a reference to the
+ /// container when one is found, `None` otherwise.
+ pub fn get_ranges(&self, seqname: &str) -> Option<&C> {
+ self.ranges.get(seqname)
+ }
 }
 
 impl<U> GRanges<VecRangesIndexed, Vec<U>> {
@@ -105,6 +110,23 @@ impl GRanges<VecRangesEmpty, ()> {
  }
 }
 
+impl<U> GRanges<VecRangesIndexed, Vec<U>> {
+    /// Builds a `GRanges` from an iterator of fallible [`RangeRecord`] entries.
+    ///
+    /// A container is created with `GRanges::new_vec(&seqlens)` and every record is
+    /// appended with `push_range_with_data`. Iteration stops at the first `Err`,
+    /// whether it comes from the iterator itself or from pushing a range, and that
+    /// error is returned.
+    pub fn from_iter<I>(
+        iter: I,
+        seqlens: IndexMap<String, Position>,
+    ) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError>
+    where
+        I: Iterator<Item = Result<RangeRecord<U>, GRangesError>>,
+    {
+        let mut granges: GRanges<VecRangesIndexed, Vec<U>> = GRanges::new_vec(&seqlens);
+        for parsed in iter {
+            // propagate a failed record before touching the container
+            let record = parsed?;
+            granges.push_range_with_data(
+                &record.seqname,
+                record.first,
+                record.last,
+                record.data,
+            )?;
+        }
+        Ok(granges)
+    }
+}
+
 impl<T> GRanges<VecRangesIndexed, T> {
  /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree 
  /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library
@@ -123,6 +145,7 @@ impl<T> GRanges<VecRangesIndexed, T> {
  }
 }
 
+
 #[cfg(test)]
 mod tests {
  use indexmap::indexmap;

diff --git a/src/io/io.rs b/src/io/io.rs
@@ -0,0 +1,170 @@
+//! Input/Output file handling with [`InputFile`] and [`OutputFile`].
+//!
+//! These types abstract over reading/writing both plaintext and gzip-compressed
+//! input/output.
+
+use flate2::read::GzDecoder;
+use flate2::write::GzEncoder;
+use flate2::Compression;
+use std::fs::File;
+use std::io::Write;
+use std::io::{self, BufWriter};
+use std::io::{BufRead, BufReader, Read};
+
+/// Checks whether a file is gzip-compressed by looking for the gzip magic number.
+///
+/// Reads at most the first two bytes of `file_path` and compares them with
+/// `0x1f 0x8b`. A file shorter than two bytes (including an empty file) is reported
+/// as not gzipped rather than as an error.
+///
+/// # Errors
+///
+/// Returns an [`io::Error`] if the file cannot be opened or read.
+fn is_gzipped_file(file_path: &str) -> io::Result<bool> {
+    let mut magic = Vec::with_capacity(2);
+    // `take(2)` caps the read at two bytes; unlike `read_exact`, `read_to_end`
+    // does not fail when the file ends early.
+    File::open(file_path)?.take(2).read_to_end(&mut magic)?;
+
+    Ok(magic == [0x1f, 0x8b])
+}
+
+/// Represents an input file.
+///
+/// This struct is used to handle operations on an input file, such as reading from the file.
+/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input
+/// to be read through a common interface.
+pub struct InputFile {
+ /// Path to the input file, stored as given to `InputFile::new`.
+ pub filepath: String,
+ /// Comment lines gathered by `collect_metadata`; `None` until it has run.
+ pub comments: Option<Vec<String>>,
+ /// Header line found by `collect_metadata`, if any.
+ pub header: Option<String>,
+ /// Number of leading lines that `continue_reading` skips; starts at 0 and is
+ /// incremented by `collect_metadata`.
+ pub skip_lines: usize,
+}
+
+impl InputFile {
+    /// Constructs a new `InputFile`.
+    ///
+    /// No I/O happens here; the file is first opened by [`InputFile::reader`].
+    ///
+    /// # Arguments
+    ///
+    /// * `filepath` - A string slice that holds the path to the file. Gzip-compressed
+    ///   input is detected from the file's leading magic bytes when it is opened,
+    ///   not from the file extension.
+    pub fn new(filepath: &str) -> Self {
+        Self {
+            filepath: filepath.to_string(),
+            comments: None,
+            header: None,
+            skip_lines: 0,
+        }
+    }
+
+    /// Opens the file and returns a buffered reader.
+    ///
+    /// If `is_gzipped_file` reports that the file starts with the gzip magic bytes,
+    /// the returned reader transparently decompresses it; otherwise the raw file
+    /// contents are read.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`io::Error`] if the file cannot be opened or the gzip check fails.
+    pub fn reader(&self) -> io::Result<BufReader<Box<dyn Read>>> {
+        let file = File::open(&self.filepath)?;
+        let reader: Box<dyn Read> = if is_gzipped_file(&self.filepath)? {
+            Box::new(GzDecoder::new(file))
+        } else {
+            Box::new(file)
+        };
+        Ok(BufReader::new(reader))
+    }
+
+    /// Collects the comment lines and, optionally, one header line at the start of the file.
+    ///
+    /// Lines are read from the top of the file. Every leading line that starts with
+    /// `comment` is stored (with trailing whitespace trimmed) in `self.comments`. If
+    /// `header` is given and the first non-comment line starts with it, that line is
+    /// stored in `self.header`. Scanning stops at the first non-comment line in
+    /// either case, so comment-prefixed lines further down the file are never counted.
+    ///
+    /// `self.skip_lines` is incremented once per collected comment or header line.
+    /// It is not reset, so call this at most once per `InputFile`.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if `self.skip_lines` is non-zero afterwards, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`io::Error`] if the file cannot be opened or read.
+    pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result<bool> {
+        let mut buf_reader = self.reader()?;
+        let mut comments = Vec::new();
+        let mut line = String::new();
+
+        while buf_reader.read_line(&mut line)? > 0 {
+            if line.starts_with(comment) {
+                comments.push(line.trim_end().to_string());
+                self.skip_lines += 1;
+            } else {
+                // We only handle one header line. If there are more, the file is
+                // *very* poorly formatted, so just let downstream parsing errors
+                // catch this. In the future, we could have a specialized error.
+                match header {
+                    Some(header_string) if line.starts_with(header_string) => {
+                        self.header = Some(line.trim_end().to_string());
+                        self.skip_lines += 1;
+                    }
+                    _ => {}
+                }
+                // Always stop at the first non-comment line, with or without a
+                // header: everything after it is data.
+                break;
+            }
+            line.clear();
+        }
+
+        self.comments = Some(comments);
+        Ok(self.skip_lines > 0)
+    }
+
+    /// Returns a reader positioned after the comment and header lines.
+    ///
+    /// Reopens the file and consumes `self.skip_lines` lines from the start.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`io::Error`] if the file cannot be opened or read.
+    pub fn continue_reading(&self) -> io::Result<BufReader<Box<dyn Read>>> {
+        let mut buf_reader = self.reader()?;
+        let mut line = String::new();
+
+        // skip the lines that were previously read as comments or header
+        for _ in 0..self.skip_lines {
+            buf_reader.read_line(&mut line)?;
+            line.clear();
+        }
+        Ok(buf_reader)
+    }
+}
+
+/// Represents an output file.
+///
+/// This struct is used to handle operations on an output file, such as writing to the file.
+/// This abstracts writing both plaintext and gzip-compressed files.
+pub struct OutputFile {
+ /// Path of the file to create; `writer` gzip-compresses the output when it ends with ".gz".
+ pub filepath: String,
+ /// Optional lines that `writer` emits first, each prefixed with `#`.
+ pub header: Option<Vec<String>>,
+}
+
+impl OutputFile {
+    /// Creates an `OutputFile` description; nothing is written until [`OutputFile::writer`].
+    ///
+    /// # Arguments
+    ///
+    /// * `filepath` - Path of the file to write. A path ending in `.gz` makes
+    ///   `writer` produce gzip-compressed output.
+    /// * `header` - Optional lines to emit at the top of the file; each one is
+    ///   written prefixed with `#`.
+    pub fn new(filepath: &str, header: Option<Vec<String>>) -> Self {
+        let filepath = filepath.to_string();
+        Self { filepath, header }
+    }
+
+    /// Creates the file and returns a buffered writer for it.
+    ///
+    /// When the path ends with ".gz" the data is gzip-compressed with the default
+    /// compression level; otherwise it is written as plain text. Any header lines
+    /// are written, `#`-prefixed, before the writer is handed back.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `io::Error` if the file cannot be created or a header line
+    /// cannot be written.
+    pub fn writer(&self) -> io::Result<Box<dyn Write>> {
+        // Both branches create the file first, so do it once up front.
+        let file = File::create(&self.filepath)?;
+        let mut sink: Box<dyn Write> = if self.filepath.ends_with(".gz") {
+            let encoder = GzEncoder::new(file, Compression::default());
+            Box::new(BufWriter::new(encoder))
+        } else {
+            Box::new(BufWriter::new(file))
+        };
+
+        // `iter().flatten()` yields nothing when no header is set
+        for entry in self.header.iter().flatten() {
+            writeln!(sink, "#{}", entry)?;
+        }
+        Ok(sink)
+    }
+}