diff --git a/Cargo.lock b/Cargo.lock index 14d80c3..49d33b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -329,7 +329,7 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "recmap" -version = "0.2.4" +version = "0.3.4" dependencies = [ "clap", "csv", diff --git a/Cargo.toml b/Cargo.toml index 6d05553..d29322a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "recmap" -version = "0.2.4" +version = "0.3.4" edition = "2021" license = "MIT" authors = ["Vince Buffalo "] @@ -16,6 +16,7 @@ path = "src/lib.rs" [features] cli = [ "clap" ] +big-position = [] [[bin]] name = "recmap" diff --git a/src/numeric.rs b/src/numeric.rs index 503deba..4ec9b5c 100644 --- a/src/numeric.rs +++ b/src/numeric.rs @@ -1,4 +1,4 @@ -use ndarray::Array1; +use ndarray::{Array1, Array2}; use num_traits::{cast::ToPrimitive, Float, NumCast}; use std::{ cmp::Ordering, @@ -63,6 +63,45 @@ fn haldane_inverse(map_distance: T) -> T { T::from(0.5).unwrap() * (T::one() - (T::from(-2.0).unwrap() * map_distance).exp()) } +/// Build the pairwise recombination distance matrix, using some float-like type `T`. +/// +/// Creates a `positions_x.len() x positions_y.len()` matrix of recombination +/// *distances* (in Morgans), for the supplied set of positions on the physical +/// map. +/// +/// # Arguments +/// * `positions_x`: an [`ArrayView1`] of the first set of marker positions. +/// * `positions_y`: an [`ArrayView1`] of the second set of marker positions. +/// * `haldane`: whether to convert the recombination distances in *Morgans* to a +/// unit-less recombination *fraction*. +/// * `rec_floor`: an optional *floor* value; all elements in the matrix less than +/// this value will be set to this value. This is sometimes useful in downstream +/// processing when zero values create problems. +/// +pub fn recomb_dist_matrix( + positions_x: &[T], + positions_y: &[T], + haldane: bool, + min_rec: Option, +) -> Array2 { + let mut dist_matrix = Array2::::zeros((positions_x.len(), positions_y.len())); + for (i, &pos_x) in positions_x.iter().enumerate() { + for (j, &pos_y) in positions_y.iter().enumerate() { + let dist = euclidean_distance(pos_x, pos_y); + let mut rf = if !haldane { + dist + } else { + haldane_inverse(dist) + }; + if let Some(min_rec_rate) = min_rec { + rf = rf.max(min_rec_rate); + } + dist_matrix[[i, j]] = rf; + } + } + dist_matrix +} + #[derive(Debug, PartialEq)] pub enum SearchResult { Exact(usize), diff --git a/src/recmap.rs b/src/recmap.rs index 780cfc1..95e69e4 100644 --- a/src/recmap.rs +++ b/src/recmap.rs @@ -1,5 +1,6 @@ use genomap::{GenomeMap, GenomeMapError}; use indexmap::map::IndexMap; +use ndarray::Array2; use num_traits::Float; use serde::{Deserialize, Serialize}; use std::fmt::Display; @@ -10,6 +11,7 @@ use std::io::Write; use thiserror::Error; use crate::file::OutputFile; +use crate::numeric::recomb_dist_matrix; use super::file::{FileError, InputFile}; use super::numeric::interp1d; @@ -17,13 +19,36 @@ use super::numeric::interp1d; /// The float type for recombination rates. pub type RateFloat = f64; -/// The integer type for genomic positions. +/// The main position type in `recmap`. /// -/// # Developer Notes -/// In the future, applications needing support for chromosomes longer than -/// `i32::MAX` could change this through a `--feature`. +/// This type is currently an unwrapped [`u32`]. This should handle +/// chromosome lengths for nearly all species. In fact, the only exception +/// known so far is lungfush (*Neoceratodus forsteri*), which has a chromosomes +/// that reaches 5.4Gb (). +/// The [`u32::MAX`] is 4,294,967,295, i.e. 4.29 Gigabases, which means [`u32`] is +/// just barely suitable for even the largest known chromosome. There is a +/// performance and memory-efficiency tradeoff when using [`u64`] over [`u32`], +/// so [`u32`] is used by default since it handles nearly all cases. +/// +/// # Feature support for large chromosomes +/// +/// If you are working with data from a species with unusually large chromosomes, +/// you can compile `recmap` using the `--features=big-position` option, which will set +/// the [`Position`] and [`PositionOffset`] to [`u64`] and [`i64`], respectively. +/// +/// [`u32::MAX`]: std::u32::MAX +#[cfg(not(feature = "big-position"))] +pub type Position = u32; +#[cfg(feature = "big-position")] pub type Position = u64; +/// The main *signed* position type in recmap, to represent offsets (e.g. +/// for adjust range coordinates, etc). +#[cfg(not(feature = "big-position"))] +pub type PositionOffset = i32; +#[cfg(feature = "big-position")] +pub type PositionOffset = i64; + pub const CM_MB_CONVERSION: RateFloat = 1e-8; pub const RATE_PRECISION: usize = 8; pub const SCI_NOTATION_THRESH: usize = 8; @@ -433,6 +458,35 @@ impl RecMap { Ok(positions) } + /// Build the pairwise recombination distance matrix for the specified chromosome. + /// + /// Creates a `positions_x.len() x positions_y.len()` matrix of recombination + /// *distances* (in Morgans), for the supplied set of positions on the physical + /// map. + /// + /// # Arguments + /// * `positions_x`: the first set of marker positions. + /// * `positions_y`: the second set of marker positions (just repeat `positions_x` for a + /// symmetric distance matrix). + /// * `haldane`: whether to convert the recombination distances in *Morgans* to a + /// unit-less recombination *fraction*. + /// * `rec_floor`: an optional *floor* value; all elements in the matrix less than + /// this value will be set to this value. This is sometimes useful in downstream + /// processing when zero values create problems. + /// + pub fn recomb_dist_matrix( + &self, + chrom: &str, + positions_x: &[Position], + positions_y: &[Position], + haldane: bool, + min_rec: Option, + ) -> Result, RecMapError> { + let x_pos = self.interpolate_map_positions(chrom, positions_x)?; + let y_pos = self.interpolate_map_positions(chrom, positions_y)?; + Ok(recomb_dist_matrix(&x_pos, &y_pos, haldane, min_rec)) + } + /// Write recombination map to HapMap-formatted file. /// /// This file has the usual HapMap recombination map header, and columns: