Skip to content

Commit

Permalink
added doc
Browse files Browse the repository at this point in the history
  • Loading branch information
vsbuffalo committed Feb 8, 2024
1 parent 014bbfc commit 225d7eb
Show file tree
Hide file tree
Showing 9 changed files with 902 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target
20 changes: 20 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[package]
name = "recmap"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.3.0"
flate2 = "1.0.28"
genomap = "0.1.3"
indexmap = "2.2.2"
ndarray = "0.15.6"
num-traits = "0.2.17"
serde = { version = "1.0.196", features = ["derive"] }
thiserror = "1.0.56"

[dev-dependencies]
clap = { version = "4.4.18", features = ["derive"] }
tempfile = "3.10.0"
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# RecMap Library for reading and working with recombination maps in Rust

`RecMap` objects can be created from reading in a HapMap-formatted
recombination map. Note that since the HapMap recombination format does
not include the chromosome lengths, this must be specified too.
A convenience function `read_seqlens` is provided to read in TSV-formatted
"genome" files of the chromosome names and lengths.

Here is a example which loads a recombination map from a HapMap-formatted
recombination map and calculates the total map lengths.

```rust
use recmap::prelude::*;
let seqlens = read_seqlens("hg38_seqlens.tsv")
.expect("could not read seqlens");
let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens)
.expect("cannot read hapmap");

for (name, rate_map) in rec_map.iter() {
println!("{}\t{}", name, rate_map.total_map_length().unwrap());
}
```

This example can be run on the command line with:

```bash
cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt
```

```rust
use recmap::prelude::*;
let seqlens = read_seqlens("hg38_seqlens.tsv")
.expect("could not read seqlens");
let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens)
.expect("cannot read hapmap");

let positions = vec![11975064, 15007450];
rec_map.interpolate_map_positions("chr1", &positions);

```

26 changes: 26 additions & 0 deletions examples/calc_map_lengths.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use clap::Parser;
use recmap::prelude::*;

#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
/// Path to the sequence lengths file
#[clap(long, value_parser)]
seqlens: String,

/// Path to the HapMap recombination map file
#[clap(value_parser)]
hapmap: String,
}

fn main() -> Result<(), RecMapError> {
let args = Args::parse();
let seqlens = read_seqlens(&args.seqlens)?;
let rec_map = RecMap::from_hapmap(&args.hapmap, seqlens)?;

for (name, rate_map) in rec_map.iter() {
println!("{}\t{}", name, rate_map.total_map_length().unwrap());
}

Ok(())
}
145 changes: 145 additions & 0 deletions src/file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
//! Encapsulates plaintext and gzip-compressed file input and output.
//!
//! The [`InputFile`] and [`OutputFile`] abstractions are for working with
//! possible gzip-compressed TSV files.
//!
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;
use std::fs::File;
use std::io::Write;
use std::io::{self, BufWriter};
use std::io::{BufRead, BufReader, Read};
use thiserror::Error;

#[derive(Error, Debug)]
pub enum FileError {
#[error("IO error: {0}")]
IOError(#[from] io::Error),
}

/// Check if a file is a gzipped by looking for the magic numbers
fn is_gzipped_file(file_path: &str) -> io::Result<bool> {
let mut file = File::open(file_path)?;
let mut buffer = [0; 2];
file.read_exact(&mut buffer)?;

Ok(buffer == [0x1f, 0x8b])
}

/// Represents an input file.
///
/// This struct is used to handle operations on an input file, such as reading from the file.
/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input
/// to be read through a common interface.
pub struct InputFile {
pub filepath: String,
}

impl InputFile {
/// Constructs a new `InputFile`.
///
/// # Arguments
///
/// * `filepath` - A string slice that holds the path to the file. If the file extension is
/// `.gz`, `InputFile` will automatically uncompress the input.
pub fn new(filepath: &str) -> Self {
Self {
filepath: filepath.to_string(),
}
}

/// Opens the file and returns a buffered reader.
///
/// If the file is gzip-compressed (indicated by a ".gz" extension), this method will
/// automatically handle the decompression.
///
/// # Returns
///
/// A result containing a `BufReader<Box<dyn Read>>` on success, or a `FileError` on failure.
///
pub fn reader(&self) -> Result<BufReader<Box<dyn Read>>, FileError> {
let file = File::open(self.filepath.clone())?;
//let is_gzipped_name = self.filepath.ends_with(".gz");
let is_gzipped = is_gzipped_file(&self.filepath)?;
let reader: Box<dyn Read> = if is_gzipped {
Box::new(GzDecoder::new(file))
} else {
Box::new(file)
};
Ok(BufReader::new(reader))
}

/// Checks if the first line of the file matches the expected header.
///
/// # Arguments
///
/// * `expect` - A string slice representing the expected header.
///
/// # Returns
///
/// A result containing a boolean, `true` if the header matches, `false` otherwise,
/// or a `FileError` on failure.
///
pub fn has_header(&self, expect: &str) -> Result<bool, FileError> {
let mut buf_reader = self.reader()?;
let mut first_line = String::new();
buf_reader.read_line(&mut first_line)?;
let has_header = first_line.starts_with(expect);
Ok(has_header)
}
}

/// Represents an output file.
///
/// This struct is used to handle operations on an output file, such as writing to the file.
/// This abstracts writing both plaintext and gzip-compressed files.
pub struct OutputFile {
pub filepath: String,
pub header: Option<Vec<String>>,
}

impl OutputFile {
/// Constructs a new `OutputFile`.
///
/// # Arguments
///
/// * `filepath` - A string slice that holds the path to the file. If the file extension is
/// `.gz`, `OutputFile` will automatically write gzip-compressed output.
/// * `header` - An optional vector of strings representing commented header lines to be written to the file.
pub fn new(filepath: &str, header: Option<Vec<String>>) -> Self {
Self {
filepath: filepath.to_string(),
header,
}
}

/// Opens the file and returns a writer.
///
/// If the file path ends with ".gz", the file is treated as gzip-compressed, and the
/// function will handle compression automatically. If a header is set, it will be written
/// to the file.
///
/// # Returns
///
/// A result containing a `Box<dyn Write>` on success, or an `io::Error` on failure.
pub fn writer(&self) -> Result<Box<dyn Write>, io::Error> {
let outfile = &self.filepath;
let is_gzip = outfile.ends_with(".gz");
let mut writer: Box<dyn Write> = if is_gzip {
Box::new(BufWriter::new(GzEncoder::new(
File::create(outfile)?,
Compression::default(),
)))
} else {
Box::new(BufWriter::new(File::create(outfile)?))
};
// write header if one is set
if let Some(entries) = &self.header {
for entry in entries {
writeln!(writer, "#{}", entry)?;
}
}
Ok(writer)
}
}
11 changes: 11 additions & 0 deletions src/genome.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
use indexmap::IndexMap;

use super::recmap::RecMap;

pub type GenomeCoord = i32;

pub struct Genome {
pub seqlens: IndexMap<String, GenomeCoord>,
pub recmap: RecMap,
}

53 changes: 53 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//! Functionality for reading and working with recombination maps.
//!
//! [`RecMap`] objects can be created from reading in a HapMap-formatted
//! recombination map. Note that since the HapMap recombination format does
//! not include the chromosome lengths, this must be specified too.
//! A convenience function [`read_seqlens`] is provided to read in TSV-formatted
//! "genome" files of the chromosome names and lengths.
//!
//! Here is a example which loads a recombination map from a HapMap-formatted
//! recombination map and calculates the total map lengths.
//!
//! ```no_run
//! use recmap::prelude::*;
//! let seqlens = read_seqlens("hg38_seqlens.tsv")
//! .expect("could not read seqlens");
//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens)
//! .expect("cannot read hapmap");
//!
//! for (name, rate_map) in rec_map.iter() {
//! println!("{}\t{}", name, rate_map.total_map_length().unwrap());
//! }
//! ```
//!
//! This example can be run on the command line with:
//!
//! ```bash
//! cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt
//! ```
//!
//! ```no_run
//! use recmap::prelude::*;
//! let seqlens = read_seqlens("hg38_seqlens.tsv")
//! .expect("could not read seqlens");
//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens)
//! .expect("cannot read hapmap");
//!
//! let positions = vec![11975064, 15007450];
//! rec_map.interpolate_map_positions("chr1", &positions);
//!
//! ```

mod file;
mod numeric;
pub mod recmap;

pub use recmap::{read_seqlens, RecMap, RecMapError};

pub mod prelude {
pub use crate::recmap::{read_seqlens, RecMap, RecMapError};
}

#[cfg(test)]
mod tests {}
Loading

0 comments on commit 225d7eb

Please sign in to comment.