From 225d7ebcc60dbfc7ef8606caa4e08746699dc5ec Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Thu, 8 Feb 2024 00:26:59 -0800 Subject: [PATCH] added doc --- .gitignore | 1 + Cargo.toml | 20 ++ README.md | 41 ++++ examples/calc_map_lengths.rs | 26 +++ src/file.rs | 145 ++++++++++++ src/genome.rs | 11 + src/lib.rs | 53 +++++ src/numeric.rs | 174 ++++++++++++++ src/recmap.rs | 431 +++++++++++++++++++++++++++++++++++ 9 files changed, 902 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 examples/calc_map_lengths.rs create mode 100644 src/file.rs create mode 100644 src/genome.rs create mode 100644 src/lib.rs create mode 100644 src/numeric.rs create mode 100644 src/recmap.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..421ecf8 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "recmap" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.3.0" +flate2 = "1.0.28" +genomap = "0.1.3" +indexmap = "2.2.2" +ndarray = "0.15.6" +num-traits = "0.2.17" +serde = { version = "1.0.196", features = ["derive"] } +thiserror = "1.0.56" + +[dev-dependencies] +clap = { version = "4.4.18", features = ["derive"] } +tempfile = "3.10.0" diff --git a/README.md b/README.md new file mode 100644 index 0000000..3d6524d --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# RecMap Library for reading and working with recombination maps in Rust + +`RecMap` objects can be created from reading in a HapMap-formatted +recombination map. Note that since the HapMap recombination format does +not include the chromosome lengths, this must be specified too. +A convenience function `read_seqlens` is provided to read in TSV-formatted +"genome" files of the chromosome names and lengths. + +Here is a example which loads a recombination map from a HapMap-formatted +recombination map and calculates the total map lengths. + +```rust +use recmap::prelude::*; +let seqlens = read_seqlens("hg38_seqlens.tsv") + .expect("could not read seqlens"); +let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) + .expect("cannot read hapmap"); + +for (name, rate_map) in rec_map.iter() { + println!("{}\t{}", name, rate_map.total_map_length().unwrap()); +} +``` + +This example can be run on the command line with: + +```bash +cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt +``` + +```rust +use recmap::prelude::*; +let seqlens = read_seqlens("hg38_seqlens.tsv") + .expect("could not read seqlens"); +let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) + .expect("cannot read hapmap"); + +let positions = vec![11975064, 15007450]; +rec_map.interpolate_map_positions("chr1", &positions); + +``` + diff --git a/examples/calc_map_lengths.rs b/examples/calc_map_lengths.rs new file mode 100644 index 0000000..646b53d --- /dev/null +++ b/examples/calc_map_lengths.rs @@ -0,0 +1,26 @@ +use clap::Parser; +use recmap::prelude::*; + +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// Path to the sequence lengths file + #[clap(long, value_parser)] + seqlens: String, + + /// Path to the HapMap recombination map file + #[clap(value_parser)] + hapmap: String, +} + +fn main() -> Result<(), RecMapError> { + let args = Args::parse(); + let seqlens = read_seqlens(&args.seqlens)?; + let rec_map = RecMap::from_hapmap(&args.hapmap, seqlens)?; + + for (name, rate_map) in rec_map.iter() { + println!("{}\t{}", name, rate_map.total_map_length().unwrap()); + } + + Ok(()) +} diff --git a/src/file.rs b/src/file.rs new file mode 100644 index 0000000..5e94c14 --- /dev/null +++ b/src/file.rs @@ -0,0 +1,145 @@ +//! Encapsulates plaintext and gzip-compressed file input and output. +//! +//! The [`InputFile`] and [`OutputFile`] abstractions are for working with +//! possible gzip-compressed TSV files. +//! +use flate2::read::GzDecoder; +use flate2::write::GzEncoder; +use flate2::Compression; +use std::fs::File; +use std::io::Write; +use std::io::{self, BufWriter}; +use std::io::{BufRead, BufReader, Read}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum FileError { + #[error("IO error: {0}")] + IOError(#[from] io::Error), +} + +/// Check if a file is a gzipped by looking for the magic numbers +fn is_gzipped_file(file_path: &str) -> io::Result { + let mut file = File::open(file_path)?; + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + + Ok(buffer == [0x1f, 0x8b]) +} + +/// Represents an input file. +/// +/// This struct is used to handle operations on an input file, such as reading from the file. +/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input +/// to be read through a common interface. +pub struct InputFile { + pub filepath: String, +} + +impl InputFile { + /// Constructs a new `InputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `InputFile` will automatically uncompress the input. + pub fn new(filepath: &str) -> Self { + Self { + filepath: filepath.to_string(), + } + } + + /// Opens the file and returns a buffered reader. + /// + /// If the file is gzip-compressed (indicated by a ".gz" extension), this method will + /// automatically handle the decompression. + /// + /// # Returns + /// + /// A result containing a `BufReader>` on success, or a `FileError` on failure. + /// + pub fn reader(&self) -> Result>, FileError> { + let file = File::open(self.filepath.clone())?; + //let is_gzipped_name = self.filepath.ends_with(".gz"); + let is_gzipped = is_gzipped_file(&self.filepath)?; + let reader: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + Ok(BufReader::new(reader)) + } + + /// Checks if the first line of the file matches the expected header. + /// + /// # Arguments + /// + /// * `expect` - A string slice representing the expected header. + /// + /// # Returns + /// + /// A result containing a boolean, `true` if the header matches, `false` otherwise, + /// or a `FileError` on failure. + /// + pub fn has_header(&self, expect: &str) -> Result { + let mut buf_reader = self.reader()?; + let mut first_line = String::new(); + buf_reader.read_line(&mut first_line)?; + let has_header = first_line.starts_with(expect); + Ok(has_header) + } +} + +/// Represents an output file. +/// +/// This struct is used to handle operations on an output file, such as writing to the file. +/// This abstracts writing both plaintext and gzip-compressed files. +pub struct OutputFile { + pub filepath: String, + pub header: Option>, +} + +impl OutputFile { + /// Constructs a new `OutputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `OutputFile` will automatically write gzip-compressed output. + /// * `header` - An optional vector of strings representing commented header lines to be written to the file. + pub fn new(filepath: &str, header: Option>) -> Self { + Self { + filepath: filepath.to_string(), + header, + } + } + + /// Opens the file and returns a writer. + /// + /// If the file path ends with ".gz", the file is treated as gzip-compressed, and the + /// function will handle compression automatically. If a header is set, it will be written + /// to the file. + /// + /// # Returns + /// + /// A result containing a `Box` on success, or an `io::Error` on failure. + pub fn writer(&self) -> Result, io::Error> { + let outfile = &self.filepath; + let is_gzip = outfile.ends_with(".gz"); + let mut writer: Box = if is_gzip { + Box::new(BufWriter::new(GzEncoder::new( + File::create(outfile)?, + Compression::default(), + ))) + } else { + Box::new(BufWriter::new(File::create(outfile)?)) + }; + // write header if one is set + if let Some(entries) = &self.header { + for entry in entries { + writeln!(writer, "#{}", entry)?; + } + } + Ok(writer) + } +} diff --git a/src/genome.rs b/src/genome.rs new file mode 100644 index 0000000..675fe4a --- /dev/null +++ b/src/genome.rs @@ -0,0 +1,11 @@ +use indexmap::IndexMap; + +use super::recmap::RecMap; + +pub type GenomeCoord = i32; + +pub struct Genome { + pub seqlens: IndexMap, + pub recmap: RecMap, +} + diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..f54a6bf --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,53 @@ +//! Functionality for reading and working with recombination maps. +//! +//! [`RecMap`] objects can be created from reading in a HapMap-formatted +//! recombination map. Note that since the HapMap recombination format does +//! not include the chromosome lengths, this must be specified too. +//! A convenience function [`read_seqlens`] is provided to read in TSV-formatted +//! "genome" files of the chromosome names and lengths. +//! +//! Here is a example which loads a recombination map from a HapMap-formatted +//! recombination map and calculates the total map lengths. +//! +//! ```no_run +//! use recmap::prelude::*; +//! let seqlens = read_seqlens("hg38_seqlens.tsv") +//! .expect("could not read seqlens"); +//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) +//! .expect("cannot read hapmap"); +//! +//! for (name, rate_map) in rec_map.iter() { +//! println!("{}\t{}", name, rate_map.total_map_length().unwrap()); +//! } +//! ``` +//! +//! This example can be run on the command line with: +//! +//! ```bash +//! cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt +//! ``` +//! +//! ```no_run +//! use recmap::prelude::*; +//! let seqlens = read_seqlens("hg38_seqlens.tsv") +//! .expect("could not read seqlens"); +//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) +//! .expect("cannot read hapmap"); +//! +//! let positions = vec![11975064, 15007450]; +//! rec_map.interpolate_map_positions("chr1", &positions); +//! +//! ``` + +mod file; +mod numeric; +pub mod recmap; + +pub use recmap::{read_seqlens, RecMap, RecMapError}; + +pub mod prelude { + pub use crate::recmap::{read_seqlens, RecMap, RecMapError}; +} + +#[cfg(test)] +mod tests {} diff --git a/src/numeric.rs b/src/numeric.rs new file mode 100644 index 0000000..88cd58a --- /dev/null +++ b/src/numeric.rs @@ -0,0 +1,174 @@ +use ndarray::Array1; +use num_traits::{cast::ToPrimitive, Float, NumCast}; +use std::{ + cmp::Ordering, + fmt::{Debug, Display}, +}; + +/// Assert to float iterables are the same up to `eps`. +#[allow(dead_code)] +pub fn assert_float_eq(left: T, right: T, eps: T) +where + T: Float + Display, +{ + if left.is_nan() { + assert!(right.is_nan(), "left is NaN, but right is not"); + } else { + let diff = (left - right).abs(); + assert!( + diff < eps, + "values |{} - {}| ≥ {} (diff: {})", + left, + right, + eps, + diff + ); + } +} + +/// Assert to float values are the same up to `eps`. +#[allow(dead_code)] +pub fn assert_floats_eq(left: &[f64], right: &[f64], eps: f64) { + assert_eq!(left.len(), right.len()); + for (l, r) in left.iter().zip(right.iter()) { + assert_float_eq(*l, *r, eps) + } +} + +#[allow(dead_code)] +pub fn absolute_differences(a: &Array1, b: &Array1) -> Array1 +where + T: Float, +{ + a.iter() + .zip(b.iter()) + .map(|(&a_i, &b_i)| (a_i - b_i).abs()) + .collect::>() +} + +// Generic Euclidean distance function +#[allow(dead_code)] +pub fn euclidean_distance(x: T, y: T) -> T { + (x - y).abs() +} + +// Inverse Haldane's mapping function +// +// Brings map distances to recombination fractions +#[allow(dead_code)] +fn haldane_inverse(map_distance: T) -> T { + T::from(0.5).unwrap() * (T::one() - (T::from(-2.0).unwrap() * map_distance).exp()) +} + +#[derive(Debug, PartialEq)] +pub enum SearchResult { + Exact(usize), + LowerBound(usize), + UpperBound(usize), + LeftOf(usize), +} + +impl SearchResult { + #[allow(dead_code)] + pub fn get_index(&self) -> usize { + match self { + SearchResult::Exact(idx) => *idx, + SearchResult::LeftOf(idx) => *idx, + SearchResult::LowerBound(idx) => *idx, + SearchResult::UpperBound(idx) => *idx, + } + } +} + +pub fn search_sorted(vec: &[T], new_val: T) -> SearchResult { + let mut left = 0; + let mut right = vec.len(); + while left < right { + let mid = left + (right - left) / 2; + + match vec[mid].partial_cmp(&new_val).unwrap() { + Ordering::Less => left = mid + 1, + Ordering::Greater => right = mid, + Ordering::Equal => return SearchResult::Exact(mid), + } + } + + if left == 0 { + SearchResult::LowerBound(left) + } else if left < vec.len() { + SearchResult::LeftOf(left) + } else { + SearchResult::UpperBound(left) + } +} + +pub fn interp1d(x: &[Tx], y: &[Ty], x0: Tx) -> Option +where + Tx: PartialOrd + ToPrimitive + Copy + Debug, + Ty: ToPrimitive + NumCast + Copy + Debug, +{ + assert!(x.len() == y.len()); + let index = search_sorted(x, x0); + match index { + SearchResult::Exact(idx) => Some(y[idx]), + SearchResult::LeftOf(idx) => { + if idx == 0 || idx >= x.len() { + return None; + } + + let x1 = ToPrimitive::to_f64(&x[idx - 1])?; + let x2 = ToPrimitive::to_f64(&x[idx])?; + let y1 = ToPrimitive::to_f64(&y[idx - 1])?; + let y2 = ToPrimitive::to_f64(&y[idx])?; + let x0 = ToPrimitive::to_f64(&x0)?; + + // Perform linear interpolation + let y0 = y1 + (y2 - y1) * (x0 - x1) / (x2 - x1); + + NumCast::from(y0) + } + SearchResult::LowerBound(_) => Some(y[0]), + SearchResult::UpperBound(idx) => Some(y[idx - 1]), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_search_sorted_empty() { + let vec: Vec = vec![]; + assert_eq!(search_sorted(&vec, 5), SearchResult::LowerBound(0)); + } + + #[test] + fn test_search_sorted_exact_match() { + let vec = vec![1, 2, 3, 4, 5]; + assert_eq!(search_sorted(&vec, 3), SearchResult::Exact(2)); + } + + #[test] + fn test_search_sorted_no_exact_match_left_of() { + let vec = vec![1, 3, 5, 7, 9]; + assert_eq!(search_sorted(&vec, 4), SearchResult::LeftOf(2)); + } + + #[test] + fn test_search_sorted_no_exact_match_lower_bound() { + let vec = vec![10, 20, 30, 40, 50]; + assert_eq!(search_sorted(&vec, 5), SearchResult::LowerBound(0)); + } + + #[test] + fn test_search_sorted_no_exact_match_upper_bound() { + let vec = vec![10, 20, 30, 40, 50]; + assert_eq!(search_sorted(&vec, 55), SearchResult::UpperBound(5)); + } + + #[test] + fn test_search_sorted_with_floats() { + let vec = vec![1.0, 2.5, 4., 4.8, 5.9]; + assert_eq!(search_sorted(&vec, 3.5), SearchResult::LeftOf(2)); + } +} diff --git a/src/recmap.rs b/src/recmap.rs new file mode 100644 index 0000000..191879b --- /dev/null +++ b/src/recmap.rs @@ -0,0 +1,431 @@ +use csv::ReaderBuilder; +use genomap::{GenomeMap, GenomeMapError}; +use indexmap::map::IndexMap; +use ndarray::Array1; +use serde::{Deserialize, Serialize}; +use std::io; +use std::io::Write; +use thiserror::Error; + +use crate::file::OutputFile; + +use super::file::{FileError, InputFile}; +use super::numeric::interp1d; + +/// The float type for recombination rates. +pub type RateFloat = f32; + +/// The integer type for genomic positions. +/// +/// # Developer Notes +/// In the future, applications needing support for chromosomes longer than +/// `i32::MAX` could change this through a `--feature`. +pub type Position = u64; + +const CM_MB_CONVERSION: RateFloat = 1e-8; + +#[derive(Error, Debug)] +pub enum RecMapError { + #[error("HapMap parsing error: {0}")] + HapMapParsingError(#[from] csv::Error), + #[error("IO error: {0}")] + IOError(#[from] io::Error), + #[error("File reading eror: {0}")] + FileError(#[from] FileError), + #[error("Missing field")] + MissingField, + #[error("Failed to parse a column of a HapMap file")] + ParseError(String), + #[error("Improper Rate value, either NaN or negative ({0})")] + ImproperRate(String), + #[error("Chromosome key '{0}' does not exist")] + NoChrom(String), + #[error("HapMap file not sorted")] + HapMapNotSorted, + #[error("Lookup out of bounds ({0}:{1})")] + LookupOutOfBounds(String, Position), + #[error("Internal Error")] + InternalError(String), + #[error("GenomeMap Error: error updating GenomeMap")] + GenomeMapError(#[from] GenomeMapError), +} + +/// Read a tab-delimited *genome file* of sequence (i.e. chromosome) names and their lengths. +pub fn read_seqlens(filepath: &str) -> Result, csv::Error> { + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_path(filepath)?; + + let mut seqlens = IndexMap::new(); + + #[derive(Debug, Serialize, Deserialize, Default)] + struct SeqLenEntry { + chrom: String, + length: Position, + } + + for result in rdr.deserialize() { + let record: SeqLenEntry = result?; + seqlens.insert(record.chrom, record.length); + } + + Ok(seqlens) +} + +/// Storage and methods for a single chromosome's recombination rates and marker positions. +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct RateMap { + /// The n+1 genomic positions of the markers, 0-indexed and ending at the sequence length. + pub ends: Vec, + /// The n rates in Morgans, between each genomic position. + pub rates: Vec, + /// The n+1 cumulative map lengths at each genomic position. + pub map_pos: Vec, +} + +impl RateMap { + /// Create a new recombination rate map for a single chromosome. + pub fn new() -> Self { + Self { + ends: Vec::new(), + rates: Vec::new(), + map_pos: Vec::new(), + } + } + + /// Returns the spans (i.e. widths) in basepairs between each marker. + pub fn span(&self) -> Vec { + self.ends.windows(2).map(|pair| pair[1] - pair[0]).collect() + } + + /// Returns the cumulative mass between each marker. + pub fn mass(&self) -> Vec { + self.rates + .iter() + .zip(self.span().iter()) + .map(|(&x, &y)| x * y as RateFloat) + .collect() + } + + /// Calculate the cumulative map length at each position. + pub fn calc_cumulative_mass(&mut self) { + let mass = self.mass(); + //println!("mass: {:?}", mass); + let cumulative_sum: Vec<_> = mass + .iter() + .scan(0.0, |state, &x| { + *state += x; + Some(*state) + }) + .collect(); + self.map_pos = cumulative_sum; + } + + /// Calculate the total map length + pub fn total_map_length(&self) -> Option<&RateFloat> { + self.map_pos.last() + } +} + +pub struct RecMap { + pub map: GenomeMap, +} + +impl RecMap { + /// Create a new [`RecMap`] from a HapMap-formatted recombination map file. + /// + /// This method also supports reading directly from a gzip-compressed file. + /// + /// The HapMap recombination format looks like: + /// + /// ```text + /// Chromosome Position(bp) Rate(cM/Mb) Map(cM) + /// chr1 55550 2.981822 0.000000 + /// chr1 82571 2.082414 0.080572 + /// chr1 88169 2.081358 0.092229 + /// chr1 254996 3.354927 0.439456 + /// chr1 564598 2.887498 1.478148 + /// chr1 564621 2.885864 1.478214 + /// chr1 565433 2.883892 1.480558 + /// chr1 568322 2.887570 1.488889 + /// chr1 568527 2.895420 1.489481 + /// ``` + /// + /// The HapMap format is well-described in the [tskit documentation](https://tskit.dev/msprime/docs/stable/api.html#msprime.RateMap.read_hapmap). + /// + /// This parser is a bit more permissive -- it will allow no headers. + /// + pub fn from_hapmap( + filepath: &str, + seqlens: IndexMap, + ) -> Result { + let input_file = InputFile::new(filepath); + + // read one line to check for headers + let has_header = input_file.has_header("Chromosome")?; + + let buf_reader = input_file.reader()?; + + let mut rdr = ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(has_header) + .from_reader(buf_reader); + + let mut rec_map: GenomeMap = GenomeMap::new(); + + let mut last_chrom: Option = None; + let mut last_end: Option = None; + + for result in rdr.records() { + let record = result.map_err(RecMapError::HapMapParsingError)?; + + // remove comment lines (TODO could store in metadata) + if record.get(0).map_or(false, |s| s.starts_with('#')) { + continue; + } + + // get the chrom column + let chrom = record.get(0).ok_or(RecMapError::MissingField)?.to_string(); + + // Our parser will always add on the chromosome end. Most times the + // HapMap file will *not* have this, but we still check. + match last_chrom { + None => {} + Some(last) => { + if chrom != last { + // We're on a new chromosome. Insert the last entry into rec_map + if let Some(seq_len) = seqlens.get(&last) { + let chrom_entry = rec_map.entry_or_default(&last); + + // Assuming the rate is 0 for these extra entries + if let Some(&last_end) = chrom_entry.ends.last() { + if last_end != *seq_len { + chrom_entry.ends.push(*seq_len); + chrom_entry.rates.push(0.0); + } + } + } + // reset the end too + last_end = None; + } + } + } + + // Update last_chrom and last_end + last_chrom = Some(chrom.clone()); + + // get the position and rate column, parsing into proper numeric types + let end_str = record.get(1).ok_or(RecMapError::MissingField)?; + let end: Position = end_str.parse().map_err(|_| { + dbg!(&end_str); + RecMapError::ParseError(format!("Failed to parse end from string: {}", end_str)) + })?; + + // Check that everything is sorted + match last_end { + None => Some(end), + Some(last_end) => { + if end >= last_end { + return Err(RecMapError::HapMapNotSorted); + } + Some(end) + } + }; + + let rate_str = record.get(2).ok_or(RecMapError::MissingField)?; + let rate: RateFloat = rate_str.parse().map_err(|_| { + RecMapError::ParseError(format!("Failed to parse rate from string: {}", rate_str)) + })?; + + // check rate isn't NaN or negative + if rate.is_nan() || rate < 0.0 { + return Err(RecMapError::ImproperRate(format!("{}:{}", chrom, end))); + } + + // HapMap rates are *always* in cM/Mb, but the natural unit is Morgans, so + // we convert here. + let rate = CM_MB_CONVERSION * rate; + + // Insert into GenomeMap, making a new one for this chromosome if needed. + // Note that we will also pad the first entry so that it's position zero, with rate + // zero. + if let Some(chrom_entry) = rec_map.get_mut(&chrom) { + chrom_entry.ends.push(end); + chrom_entry.rates.push(rate); + } else { + let mut new_rate_map = RateMap::new(); + + // If doesn't start with zero, add zero. + if end != 0 { + new_rate_map.ends.push(0); + new_rate_map.rates.push(0.0); + } + + // then add the current entry too + new_rate_map.ends.push(end); + new_rate_map.rates.push(rate); + + rec_map.insert(&chrom, new_rate_map)?; + } + } + + // Insert the final entry for the last chromosome outside the loop if needed + if let Some(last) = last_chrom { + if let Some(seq_len) = seqlens.get(&last) { + let chrom_entry = rec_map.entry_or_default(&last); + + // Assuming the rate is 0 for these extra entries + if chrom_entry.ends.is_empty() || chrom_entry.ends.last().unwrap() != seq_len { + chrom_entry.ends.push(*seq_len); + } + } + } + + let mut rec_map = RecMap { map: rec_map }; + rec_map.generate_map_positions(); + Ok(rec_map) + } + + /// Return the number of chromosomes in the recombination map. + pub fn len(&self) -> usize { + self.map.len() + } + + /// Return if the recombination map is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Iterate over chromosome name and [`RateMap`] tuples. + pub fn iter(&self) -> impl Iterator { + self.map.iter() + } + + /// Generate the cumulative map positions via the marginal recombination rates. + fn generate_map_positions(&mut self) { + for rate_map in self.map.values_mut() { + rate_map.calc_cumulative_mass(); + } + } + + /// Interpolate the recombination map position at the specified physical position. + /// This uses linear interpolation. + /// + /// # Arguments + /// * `name`: the chromosome name. + /// * `position`: the physical position to estimate the recombination map position at. + pub fn interpolate_map_position( + &self, + name: &str, + position: Position, + ) -> Result { + let rate_map = self + .map + .get(name) + .ok_or(RecMapError::NoChrom(name.to_string()))?; + let ends = &rate_map.ends; + let interp_result = interp1d(&ends[0..ends.len() - 1], &rate_map.map_pos, position); + let interpolated_map_pos = + interp_result.ok_or(RecMapError::LookupOutOfBounds(name.to_string(), position))?; + Ok(interpolated_map_pos) + } + + /// Interpolate the recombination map position at the specified physical positions. + /// This uses linear interpolation. + /// + /// # Arguments + /// * `name`: the chromosome name. + /// * `position`: the physical position to estimate the recombination map position at. + pub fn interpolate_map_positions( + &self, + chrom: &str, + positions: &[Position], + ) -> Result, RecMapError> { + let positions: Vec = positions + .iter() + .map(|p| self.interpolate_map_position(chrom, *p)) + .collect::, _>>()?; + Ok(Array1::from_vec(positions)) + } + + /// Write recombination map to a BED-like TSV file. + /// + /// # Arguments + /// * `filepath`: The filepath to write the recombination map to. If the filepath + /// has an `.gz` extension, the output will be gzip compressed. + /// If `filepath` is `None`, uncompressed output will be written to standard out. + pub fn write_tsv(&self, filepath: Option<&str>) -> Result<(), RecMapError> { + let precision = 8; + let mut writer: Box = match filepath { + Some(path) => { + let file = OutputFile::new(path, None); + file.writer()? + } + None => { + // Use stdout if filepath is None + Box::new(std::io::stdout()) + } + }; + + for (chrom, rate_map) in self.map.iter() { + // get the (start, end) ranges from the end points. + let ranges: Vec<(Position, Position)> = rate_map + .ends + .windows(2) + .map(|pair| (pair[0], pair[1])) + .collect(); + + // write the rows + for (i, range) in ranges.iter().enumerate() { + let rate = rate_map.rates[i]; + + // Write the record to the file + let rate_rescaled: RateFloat = rate / CM_MB_CONVERSION; + + let formatted_rate = format!("{:.1$e}", rate_rescaled, precision); + + // Write the record to the file + writeln!( + writer, + "{}\t{}\t{}\t{}", + chrom, range.0, range.1, formatted_rate + )?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::RecMap; + use tempfile::tempdir; + + fn read_hapmap() -> RecMap { + let seqlens = indexmap::indexmap! { + "chr1".to_string() => 6980669, + "chr2".to_string() => 6004443, + "chr3".to_string() => 6026894, + }; + let rec_map = RecMap::from_hapmap("tests/data/decode_2010_test_map.txt", seqlens).unwrap(); + + let dir = tempdir().unwrap(); + let output_path = dir.path().join("output.tsv"); + + rec_map + .write_tsv(Some(output_path.to_str().unwrap())) + .unwrap(); + + dir.close().unwrap(); + rec_map + } + + #[test] + fn test_hapmap_read() { + let rm = read_hapmap(); + assert_eq!(rm.len(), 3); + assert!(!rm.is_empty()); + } +}