Skip to content

Commit

Permalink
temporary commit
Browse files Browse the repository at this point in the history
  • Loading branch information
vsbuffalo committed Feb 15, 2024
1 parent 1b5e236 commit 2a644e8
Show file tree
Hide file tree
Showing 20 changed files with 884 additions and 24 deletions.
24 changes: 22 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,32 @@
name = "granges"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
license = "MIT"
authors = ["Vince Buffalo <[email protected]>"]
keywords = ["genomics", "bioinformatics", "compbio"]
categories = ["science"]
documentation = "https://docs.rs/granges/"
repository = "https://github.com/vsbuffalo/granges"
description = "A Rust library and command line tool for genomic range operations."

[dependencies]
# clap = { version = "4.4.18", features = ["derive"], optional = true }
clap = { version = "4.4.18", features = ["derive"] }
coitrees = { version = "0.4.0", features = ["nosimd"] }
flate2 = "1.0.28"
genomap = "0.2.6"
indexmap = "2.2.3"
ndarray = "0.15.6"
noodles = { version = "0.63.0", features = ["core", "bed"] }
rand = "0.8.5"
thiserror = "1.0.57"

# [features]
# cli = [ "clap" ]

[[bin]]
name = "granges"
path = "src/main/mod.rs"
# required-features = ["cli"]


3 changes: 3 additions & 0 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Data container implementations.

pub mod vec;
62 changes: 62 additions & 0 deletions src/data/ndarray.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`].

use ndarray::{Array1, Array2, ArrayView1};
use crate::traits::IndexedDataContainer;

/// [`IndexedDataContainer`] implementation for one-dimensional arrays.
///
/// Every array element is one data entry, and entries are handed out by copy.
impl<'a, U> IndexedDataContainer<'a> for Array1<U>
where
    U: Copy + Default + 'a,
{
    type Item = U;
    type Output = Array1<U>;

    /// Copy out the element stored at `index`.
    fn get_value(&'a self, index: usize) -> Self::Item {
        self[index]
    }

    /// Number of elements in the array.
    fn len(&self) -> usize {
        // Method resolution picks the array's inherent `len` here, not this
        // trait method, so this does not recurse.
        self.len()
    }

    /// Whether `index` is smaller than the length of the first (only) axis.
    fn is_valid_index(&self, index: usize) -> bool {
        let n_elements = self.shape()[0];
        index < n_elements
    }

    /// Build a new array holding the elements at `indices`, in the given order.
    fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
        let selected = indices.iter().copied().map(|idx| self.get_value(idx));
        Array1::from_iter(selected)
    }
}

/// [`IndexedDataContainer`] implementation for two-dimensional arrays.
///
/// Each *row* of the array is one data entry.
impl<'a, U> IndexedDataContainer<'a> for Array2<U>
where
    U: Copy + Default + 'a,
{
    type Item = ArrayView1<'a, U>;
    type Output = Array2<U>;

    /// Return a view of row `index`.
    fn get_value(&'a self, index: usize) -> Self::Item {
        self.row(index)
    }

    /// Number of rows (length of the first axis).
    fn len(&self) -> usize {
        self.shape()[0]
    }

    /// Whether `index` is smaller than the number of rows.
    fn is_valid_index(&self, index: usize) -> bool {
        index < self.shape()[0]
    }

    /// Build a new array from the rows at `indices`, in the given order.
    ///
    /// The result has shape `(indices.len(), number of columns)`.
    fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
        let cols = self.shape()[1];

        // Copy the selected rows straight into one pre-sized buffer rather
        // than allocating a temporary `Vec` for every row.
        let mut rows_data: Vec<U> = Vec::with_capacity(indices.len() * cols);
        for &idx in indices {
            rows_data.extend(self.row(idx).iter().copied());
        }

        // shape is (number of indices, number of columns)
        Array2::from_shape_vec((indices.len(), cols), rows_data)
            .expect("Shape and collected data size mismatch")
    }
}
33 changes: 33 additions & 0 deletions src/data/vec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//! Data container implementations for [`Vec<U>`].


/// Trait methods for the commonly-used `Vec<U>` data container.
///
/// Note that the associated `Item` type is always a *reference* to the data elements.
impl<'a, U> IndexedDataContainer<'a> for Vec<U>
where
U: Clone + 'a,
{
type Item = &'a U;
type Output = Vec<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
self.get(index).unwrap()
}

fn len(&self) -> usize {
self.len()
}

fn is_valid_index(&self, index: usize) -> bool {
self.get(index).is_some()
}

fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
}
}




14 changes: 11 additions & 3 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,23 @@ use crate::Position;

#[derive(Debug, Error)]
pub enum GRangesError {
// IO related errors
#[error("File reading eror: {0}")]
IOError(#[from] std::io::Error),

// File parsing related errors
#[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")]
BedlikeTooFewColumns(String),
#[error("File has invalid column type entry: {0}")]
InvalidColumnType(String),

// Invalid genomic range errors
#[error("Range invalid: start ({0}) must be greater than end ({1})")]
InvalidGenomicRange(Position, Position),

#[error("Range [{0}, {1}] is invalid for sequence of length {2}")]
InvalidGenomicRangeForSequence(Position, Position, Position),

#[error("Sequence name '{0}' is not the ranges container")]
MissingSequence(String),

#[error("Error encountered in genomap::GenomeMap")]
GenomeMapError(#[from] GenomeMapError),
}
25 changes: 24 additions & 1 deletion src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ use genomap::GenomeMap;
use indexmap::IndexMap;

use crate::{
io::RangeRecord,
prelude::GRangesError,
ranges::{
coitrees::{COITrees, COITreesIndexed},
vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
RangeEmpty, RangeIndexed,
},
traits::RangeContainer,
traits::{RangeContainer, RangesIterable, IndexedDataContainer},
Position,
};

Expand All @@ -31,6 +32,10 @@ where
pub fn is_empty(&self) -> bool {
    // Empty exactly when `len()` reports zero.
    self.len() == 0
}

/// Look up the range container stored under `seqname`.
///
/// This forwards to `self.ranges.get(seqname)` and returns its `Option`
/// unchanged.
pub fn get_ranges(&self, seqname: &str) -> Option<&C> {
    self.ranges.get(seqname)
}
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {
Expand Down Expand Up @@ -105,6 +110,23 @@ impl GRanges<VecRangesEmpty, ()> {
}
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {
    /// Build a [`GRanges`] object with `Vec<U>` data from fallible range records.
    ///
    /// A new container is created from `seqlens` with `GRanges::new_vec`, and
    /// every record is added with `push_range_with_data` in iteration order.
    ///
    /// # Arguments
    ///
    /// * `iter` - anything that can be iterated to yield
    ///   `Result<RangeRecord<U>, GRangesError>` items. Plain iterators are
    ///   accepted as before.
    /// * `seqlens` - sequence names and their lengths.
    ///
    /// # Errors
    ///
    /// Returns the first error encountered, whether yielded by `iter` or
    /// returned by `push_range_with_data`. Records after that are not consumed.
    pub fn from_iter<I>(
        iter: I,
        seqlens: IndexMap<String, Position>,
    ) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError>
    where
        I: IntoIterator<Item = Result<RangeRecord<U>, GRangesError>>,
    {
        let mut gr = GRanges::new_vec(&seqlens);
        for possible_entry in iter {
            let entry = possible_entry?;
            gr.push_range_with_data(&entry.seqname, entry.first, entry.last, entry.data)?;
        }
        Ok(gr)
    }
}

impl<T> GRanges<VecRangesIndexed, T> {
/// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree
/// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library
Expand All @@ -123,6 +145,7 @@ impl<T> GRanges<VecRangesIndexed, T> {
}
}


#[cfg(test)]
mod tests {
use indexmap::indexmap;
Expand Down
170 changes: 170 additions & 0 deletions src/io/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
//! Input/Output file handling with [`InputFile`] and [`OutputFile`].
//!
//! These types abstract over reading/writing both plaintext and gzip-compressed
//! input/output.

use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;
use std::fs::File;
use std::io::Write;
use std::io::{self, BufWriter};
use std::io::{BufRead, BufReader, Read};

/// Check whether a file is gzip-compressed by looking for the gzip magic number.
///
/// Reads at most the first two bytes of the file and compares them with
/// `0x1f 0x8b`. A file shorter than two bytes (including an empty file) is
/// reported as not gzipped rather than producing an `UnexpectedEof` error.
///
/// # Errors
///
/// Returns the underlying [`io::Error`] if the file cannot be opened or read.
fn is_gzipped_file(file_path: &str) -> io::Result<bool> {
    const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b];

    let mut magic = Vec::with_capacity(GZIP_MAGIC.len());
    // `take` + `read_to_end` tolerates short files, unlike `read_exact`.
    File::open(file_path)?
        .take(GZIP_MAGIC.len() as u64)
        .read_to_end(&mut magic)?;

    Ok(magic == GZIP_MAGIC)
}

/// Represents an input file.
///
/// This struct is used to handle operations on an input file, such as reading from the file.
/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input
/// to be read through a common interface.
#[derive(Debug, Clone)]
pub struct InputFile {
    /// Path of the file to read.
    pub filepath: String,
    /// Leading comment lines, populated by `collect_metadata` (`None` until then).
    pub comments: Option<Vec<String>>,
    /// Header line, populated by `collect_metadata` when one is found.
    pub header: Option<String>,
    /// Number of leading lines that `continue_reading` skips.
    pub skip_lines: usize,
}

impl InputFile {
    /// Constructs a new `InputFile`.
    ///
    /// # Arguments
    ///
    /// * `filepath` - A string slice that holds the path to the file. Gzip-compressed
    ///   input is detected from the file's contents when it is opened and is
    ///   uncompressed automatically.
    pub fn new(filepath: &str) -> Self {
        Self {
            filepath: filepath.to_string(),
            comments: None,
            header: None,
            skip_lines: 0,
        }
    }

    /// Opens the file and returns a buffered reader.
    ///
    /// If the file is gzip-compressed (as reported by `is_gzipped_file`, which
    /// inspects the file's leading bytes rather than its extension), this method
    /// will automatically handle the decompression.
    ///
    /// # Errors
    ///
    /// Returns an [`io::Error`] if the file cannot be opened or inspected.
    pub fn reader(&self) -> io::Result<BufReader<Box<dyn Read>>> {
        let file = File::open(&self.filepath)?;
        let reader: Box<dyn Read> = if is_gzipped_file(&self.filepath)? {
            Box::new(GzDecoder::new(file))
        } else {
            Box::new(file)
        };
        Ok(BufReader::new(reader))
    }

    /// Collects the comment lines and/or a header line at the start of the file.
    ///
    /// Lines are read from the top of the file. Every leading line that starts
    /// with `comment` is stored (right-trimmed) in `self.comments`. The scan
    /// stops at the first line that is not a comment; if `header` is given and
    /// that line starts with it, the line is stored (right-trimmed) in
    /// `self.header`. Because the comment check comes first, a header line that
    /// also starts with `comment` is recorded as a comment.
    ///
    /// `self.comments`, `self.header` and `self.skip_lines` are replaced with
    /// the result of this scan, so calling the method again does not double
    /// count skipped lines.
    ///
    /// # Returns
    ///
    /// `Ok(true)` if at least one comment or header line was found.
    ///
    /// # Errors
    ///
    /// Returns an [`io::Error`] if the file cannot be opened or read.
    pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result<bool> {
        let mut buf_reader = self.reader()?;
        let mut comments = Vec::new();
        let mut header_line = None;
        let mut skip_lines = 0;
        let mut line = String::new();

        while buf_reader.read_line(&mut line)? > 0 {
            if line.starts_with(comment) {
                comments.push(line.trim_end().to_string());
                skip_lines += 1;
                line.clear();
                continue;
            }
            // We only handle one header line. If there are more, the
            // file is *very* poorly formatted. So just let downstream
            // parsing errors catch this. In the future, we could have a
            // specialized error.
            if let Some(header_string) = header {
                if line.starts_with(header_string) {
                    header_line = Some(line.trim_end().to_string());
                    skip_lines += 1;
                }
            }
            // Stop at the first non-comment line, whether or not it was a
            // header: comment-like lines further down belong to the data and
            // must not be counted as skipped metadata.
            break;
        }

        self.comments = Some(comments);
        self.header = header_line;
        self.skip_lines = skip_lines;
        Ok(self.skip_lines > 0)
    }

    /// Method to continue reading after skipping the comment and header lines.
    ///
    /// Opens a fresh reader and discards the first `self.skip_lines` lines
    /// (fewer if the file ends earlier).
    ///
    /// # Errors
    ///
    /// Returns an [`io::Error`] if the file cannot be opened or read.
    pub fn continue_reading(&self) -> io::Result<BufReader<Box<dyn Read>>> {
        let mut buf_reader = self.reader()?;
        let mut line = String::new();

        // skip the lines that were previously read as comments or header
        for _ in 0..self.skip_lines {
            if buf_reader.read_line(&mut line)? == 0 {
                // end of file: nothing left to skip
                break;
            }
            line.clear();
        }
        Ok(buf_reader)
    }
}

/// Represents an output file.
///
/// This struct is used to handle operations on an output file, such as writing to the file.
/// This abstracts writing both plaintext and gzip-compressed files.
#[derive(Debug, Clone)]
pub struct OutputFile {
    /// Path of the file to write; a path ending in `.gz` selects gzip output.
    pub filepath: String,
    /// Optional header lines; `writer` writes each one prefixed with `#`.
    pub header: Option<Vec<String>>,
}

impl OutputFile {
    /// Creates an `OutputFile` for `filepath`.
    ///
    /// # Arguments
    ///
    /// * `filepath` - Path of the file to write. A path ending in `.gz` makes
    ///   `writer` produce gzip-compressed output.
    /// * `header` - Optional header lines; each is written as a `#`-prefixed line.
    pub fn new(filepath: &str, header: Option<Vec<String>>) -> Self {
        let filepath = filepath.to_string();
        Self { filepath, header }
    }

    /// Creates the file and returns a buffered writer for it.
    ///
    /// When the path ends with ".gz" the writer compresses with the default
    /// compression level. Any header lines are written, each prefixed with
    /// `#`, before the writer is handed back.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` if the file cannot be created or the header
    /// cannot be written.
    pub fn writer(&self) -> io::Result<Box<dyn Write>> {
        let file = File::create(&self.filepath)?;
        let mut sink: Box<dyn Write> = if self.filepath.ends_with(".gz") {
            let encoder = GzEncoder::new(file, Compression::default());
            Box::new(BufWriter::new(encoder))
        } else {
            Box::new(BufWriter::new(file))
        };

        // `flatten` yields no lines at all when no header is set
        for entry in self.header.iter().flatten() {
            writeln!(sink, "#{}", entry)?;
        }
        Ok(sink)
    }
}
Loading

0 comments on commit 2a644e8

Please sign in to comment.