Skip to content

Commit

Permalink
Switched from Position being u64 to u32, with --feature=big-position …
Browse files Browse the repository at this point in the history
…for u64.

 - New position type for memory efficiency; matches what the GRanges library
   does.
 - New `recomb_dist_matrix` method, which builds a recombination distance
   matrix, optionally using Haldane's mapping function to convert distance in
   Morgans into recombination fraction.
  • Loading branch information
vsbuffalo committed Feb 28, 2024
1 parent 471ec62 commit 031b8b2
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "recmap"
version = "0.2.4"
version = "0.3.4"
edition = "2021"
license = "MIT"
authors = ["Vince Buffalo <[email protected]>"]
Expand All @@ -16,6 +16,7 @@ path = "src/lib.rs"

[features]
cli = [ "clap" ]
big-position = []

[[bin]]
name = "recmap"
Expand Down
41 changes: 40 additions & 1 deletion src/numeric.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use ndarray::Array1;
use ndarray::{Array1, Array2};
use num_traits::{cast::ToPrimitive, Float, NumCast};
use std::{
cmp::Ordering,
Expand Down Expand Up @@ -63,6 +63,45 @@ fn haldane_inverse<T: Float>(map_distance: T) -> T {
T::from(0.5).unwrap() * (T::one() - (T::from(-2.0).unwrap() * map_distance).exp())
}

/// Build the pairwise recombination distance matrix, using some float-like type `T`.
///
/// Creates a `positions_x.len() x positions_y.len()` matrix of recombination
/// *distances* (in Morgans), for the supplied set of positions on the physical
/// map.
///
/// # Arguments
/// * `positions_x`: an [`ArrayView1`] of the first set of marker positions.
/// * `positions_y`: an [`ArrayView1`] of the second set of marker positions.
/// * `haldane`: whether to convert the recombination distances in *Morgans* to a
/// unit-less recombination *fraction*.
/// * `rec_floor`: an optional *floor* value; all elements in the matrix less than
/// this value will be set to this value. This is sometimes useful in downstream
/// processing when zero values create problems.
///
pub fn recomb_dist_matrix<T: Float>(
positions_x: &[T],
positions_y: &[T],
haldane: bool,
min_rec: Option<T>,
) -> Array2<T> {
let mut dist_matrix = Array2::<T>::zeros((positions_x.len(), positions_y.len()));
for (i, &pos_x) in positions_x.iter().enumerate() {
for (j, &pos_y) in positions_y.iter().enumerate() {
let dist = euclidean_distance(pos_x, pos_y);
let mut rf = if !haldane {
dist
} else {
haldane_inverse(dist)
};
if let Some(min_rec_rate) = min_rec {
rf = rf.max(min_rec_rate);
}
dist_matrix[[i, j]] = rf;
}
}
dist_matrix
}

#[derive(Debug, PartialEq)]
pub enum SearchResult {
Exact(usize),
Expand Down
62 changes: 58 additions & 4 deletions src/recmap.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use genomap::{GenomeMap, GenomeMapError};
use indexmap::map::IndexMap;
use ndarray::Array2;
use num_traits::Float;
use serde::{Deserialize, Serialize};
use std::fmt::Display;
Expand All @@ -10,20 +11,44 @@ use std::io::Write;
use thiserror::Error;

use crate::file::OutputFile;
use crate::numeric::recomb_dist_matrix;

use super::file::{FileError, InputFile};
use super::numeric::interp1d;

/// The float type for recombination rates.
pub type RateFloat = f64;

/// The integer type for genomic positions.
/// The main position type in `recmap`.
///
/// # Developer Notes
/// In the future, applications needing support for chromosomes longer than
/// `i32::MAX` could change this through a `--feature`.
/// This type is currently an unwrapped [`u32`]. This should handle
/// chromosome lengths for nearly all species. In fact, the only exception
/// known so far is lungfush (*Neoceratodus forsteri*), which has a chromosomes
/// that reaches 5.4Gb (<https://www.nature.com/articles/s41586-021-03198-8l>).
/// The [`u32::MAX`] is 4,294,967,295, i.e. 4.29 Gigabases, which means [`u32`] is
/// just barely suitable for even the largest known chromosome. There is a
/// performance and memory-efficiency tradeoff when using [`u64`] over [`u32`],
/// so [`u32`] is used by default since it handles nearly all cases.
///
/// # Feature support for large chromosomes
///
/// If you are working with data from a species with unusually large chromosomes,
/// you can compile `recmap` using the `--features=big-position` option, which will set
/// the [`Position`] and [`PositionOffset`] to [`u64`] and [`i64`], respectively.
///
/// [`u32::MAX`]: std::u32::MAX
#[cfg(not(feature = "big-position"))]
pub type Position = u32;
#[cfg(feature = "big-position")]
pub type Position = u64;

/// The main *signed* position type in recmap, to represent offsets (e.g.
/// for adjust range coordinates, etc).
#[cfg(not(feature = "big-position"))]
pub type PositionOffset = i32;
#[cfg(feature = "big-position")]
pub type PositionOffset = i64;

pub const CM_MB_CONVERSION: RateFloat = 1e-8;
pub const RATE_PRECISION: usize = 8;
pub const SCI_NOTATION_THRESH: usize = 8;
Expand Down Expand Up @@ -433,6 +458,35 @@ impl RecMap {
Ok(positions)
}

/// Build the pairwise recombination distance matrix for the specified chromosome.
///
/// Creates a `positions_x.len() x positions_y.len()` matrix of recombination
/// *distances* (in Morgans), for the supplied set of positions on the physical
/// map.
///
/// # Arguments
/// * `positions_x`: the first set of marker positions.
/// * `positions_y`: the second set of marker positions (just repeat `positions_x` for a
/// symmetric distance matrix).
/// * `haldane`: whether to convert the recombination distances in *Morgans* to a
/// unit-less recombination *fraction*.
/// * `rec_floor`: an optional *floor* value; all elements in the matrix less than
/// this value will be set to this value. This is sometimes useful in downstream
/// processing when zero values create problems.
///
pub fn recomb_dist_matrix(
&self,
chrom: &str,
positions_x: &[Position],
positions_y: &[Position],
haldane: bool,
min_rec: Option<RateFloat>,
) -> Result<Array2<RateFloat>, RecMapError> {
let x_pos = self.interpolate_map_positions(chrom, positions_x)?;
let y_pos = self.interpolate_map_positions(chrom, positions_y)?;
Ok(recomb_dist_matrix(&x_pos, &y_pos, haldane, min_rec))
}

/// Write recombination map to HapMap-formatted file.
///
/// This file has the usual HapMap recombination map header, and columns:
Expand Down

0 comments on commit 031b8b2

Please sign in to comment.