From 2b0b7b49709c9373c5270c55a81866b93644fa81 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Thu, 8 Feb 2024 22:28:45 -0800 Subject: [PATCH] fixed bug, added more tests --- Cargo.toml | 2 +- README.md | 6 + examples/calc_map_lengths.rs | 2 +- src/file.rs | 64 +++++-- src/lib.rs | 14 +- src/numeric.rs | 5 +- src/recmap.rs | 311 +++++++++++++++++++++++++++++------ 7 files changed, 333 insertions(+), 71 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 86371da..4ba6c79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "recmap" -version = "0.1.1" +version = "0.2.1" edition = "2021" license = "MIT" authors = ["Vince Buffalo "] diff --git a/README.md b/README.md index f46196c..81b4cb0 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,12 @@ This example can be run on the command line with: cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt ``` +One of the most common tasks when working with recombination maps is to +estimate the map position of arbitrary markers, which is usually done by linear +interpolation. `RecMap` provides an easy way to do this for one position +(`RecMap.interpolate_map_position()`) and for many positions, with +`RecMap.interpolate_map_positions()`: + ```rust use recmap::prelude::*; let seqlens = read_seqlens("hg38_seqlens.tsv") diff --git a/examples/calc_map_lengths.rs b/examples/calc_map_lengths.rs index 646b53d..012c02d 100644 --- a/examples/calc_map_lengths.rs +++ b/examples/calc_map_lengths.rs @@ -16,7 +16,7 @@ struct Args { fn main() -> Result<(), RecMapError> { let args = Args::parse(); let seqlens = read_seqlens(&args.seqlens)?; - let rec_map = RecMap::from_hapmap(&args.hapmap, seqlens)?; + let rec_map = RecMap::from_hapmap(&args.hapmap, &seqlens)?; for (name, rate_map) in rec_map.iter() { println!("{}\t{}", name, rate_map.total_map_length().unwrap()); diff --git a/src/file.rs b/src/file.rs index 5e94c14..a7c7f70 100644 --- a/src/file.rs +++ b/src/file.rs @@ -34,6 +34,9 @@ fn is_gzipped_file(file_path: &str) -> io::Result { /// to be read through a common interface. pub struct InputFile { pub filepath: String, + pub comments: Option>, + pub header: Option, + pub skip_lines: usize, } impl InputFile { @@ -46,6 +49,9 @@ impl InputFile { pub fn new(filepath: &str) -> Self { Self { filepath: filepath.to_string(), + comments: None, + header: None, + skip_lines: 0, } } @@ -70,23 +76,49 @@ impl InputFile { Ok(BufReader::new(reader)) } - /// Checks if the first line of the file matches the expected header. - /// - /// # Arguments - /// - /// * `expect` - A string slice representing the expected header. - /// - /// # Returns - /// - /// A result containing a boolean, `true` if the header matches, `false` otherwise, - /// or a `FileError` on failure. - /// - pub fn has_header(&self, expect: &str) -> Result { + /// Collects comment lines and/or a line at the start of the file. + pub fn collect_metadata(&mut self, comment: &str, header: &str) -> Result { + let mut buf_reader = self.reader()?; + let mut comments = Vec::new(); + let mut line = String::new(); + + while buf_reader.read_line(&mut line)? > 0 { + if line.starts_with(comment) { + comments.push(line.trim_end().to_string()); + self.skip_lines += 1; + } else { + if line.starts_with(header) { + self.header = Some(line.trim_end().to_string()); + self.skip_lines += 1; + // We only handle one header line. If there are more, the + // file is *very* poorly formatted. So just let downstream + // parsing errors catch this. In the future, we could have a specialized + // error. + break; + } + // break on the first non-header/comment line + break; + } + line.clear(); + } + + self.comments = Some(comments); + Ok(self.skip_lines > 0) + } + + /// Method to continue reading after skipping the comment and header lines. + pub fn continue_reading(&self) -> Result>, FileError> { let mut buf_reader = self.reader()?; - let mut first_line = String::new(); - buf_reader.read_line(&mut first_line)?; - let has_header = first_line.starts_with(expect); - Ok(has_header) + let mut skipped_lines = 0; + let mut line = String::new(); + + // skip the lines that were previously read as comments or header + while skipped_lines < self.skip_lines { + buf_reader.read_line(&mut line)?; + skipped_lines += 1; + line.clear(); + } + Ok(buf_reader) } } diff --git a/src/lib.rs b/src/lib.rs index a9093f4..bd84d5c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ //! use recmap::prelude::*; //! let seqlens = read_seqlens("hg38_seqlens.tsv") //! .expect("could not read seqlens"); -//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) +//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", &seqlens) //! .expect("cannot read hapmap"); //! //! for (name, rate_map) in rec_map.iter() { @@ -26,12 +26,18 @@ //! ```bash //! cargo run --example calc_map_lengths -- --seqlens hg38_seqlens.tsv decode_2019_map.txt //! ``` +//! One of the most common tasks when working with recombination maps is to +//! estimate the map position of arbitrary markers, which is usually done by linear +//! interpolation. `RecMap` provides an easy way to do this for one position +//! (`RecMap.interpolate_map_position()`) and for many positions, with +//! `RecMap.interpolate_map_positions()`: +//! //! //! ```no_run //! use recmap::prelude::*; //! let seqlens = read_seqlens("hg38_seqlens.tsv") //! .expect("could not read seqlens"); -//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", seqlens) +//! let rec_map = RecMap::from_hapmap("decode_2019_map.txt", &seqlens) //! .expect("cannot read hapmap"); //! //! let positions = vec![11975064, 15007450]; @@ -43,10 +49,10 @@ mod file; mod numeric; pub mod recmap; -pub use recmap::{read_seqlens, RecMap, RecMapError}; +pub use recmap::{read_seqlens, RateFloat, RecMap, RecMapError, CM_MB_CONVERSION}; pub mod prelude { - pub use crate::recmap::{read_seqlens, RecMap, RecMapError}; + pub use crate::recmap::{read_seqlens, RateFloat, RecMap, RecMapError, CM_MB_CONVERSION}; } #[cfg(test)] diff --git a/src/numeric.rs b/src/numeric.rs index 88cd58a..503deba 100644 --- a/src/numeric.rs +++ b/src/numeric.rs @@ -28,7 +28,10 @@ where /// Assert to float values are the same up to `eps`. #[allow(dead_code)] -pub fn assert_floats_eq(left: &[f64], right: &[f64], eps: f64) { +pub fn assert_floats_eq(left: &[T], right: &[T], eps: T) +where + T: Float + Display, +{ assert_eq!(left.len(), right.len()); for (l, r) in left.iter().zip(right.iter()) { assert_float_eq(*l, *r, eps) diff --git a/src/recmap.rs b/src/recmap.rs index f1675d3..8b04673 100644 --- a/src/recmap.rs +++ b/src/recmap.rs @@ -1,9 +1,11 @@ -use csv::ReaderBuilder; use genomap::{GenomeMap, GenomeMapError}; use indexmap::map::IndexMap; use ndarray::Array1; +use num_traits::Float; use serde::{Deserialize, Serialize}; +use std::fmt::LowerExp; use std::io; +use std::io::BufRead; use std::io::Write; use thiserror::Error; @@ -13,7 +15,7 @@ use super::file::{FileError, InputFile}; use super::numeric::interp1d; /// The float type for recombination rates. -pub type RateFloat = f32; +pub type RateFloat = f64; /// The integer type for genomic positions. /// @@ -22,7 +24,8 @@ pub type RateFloat = f32; /// `i32::MAX` could change this through a `--feature`. pub type Position = u64; -const CM_MB_CONVERSION: RateFloat = 1e-8; +pub const CM_MB_CONVERSION: RateFloat = 1e-8; +pub const RATE_PRECISION: usize = 8; #[derive(Error, Debug)] pub enum RecMapError { @@ -46,6 +49,8 @@ pub enum RecMapError { LookupOutOfBounds(String, Position), #[error("Internal Error")] InternalError(String), + #[error("Recombination map overuns sequence length for {0} ({1} > {2})")] + LengthMismatch(String, Position, Position), #[error("GenomeMap Error: error updating GenomeMap")] GenomeMapError(#[from] GenomeMapError), } @@ -74,7 +79,7 @@ pub fn read_seqlens(filepath: &str) -> Result, csv::E } /// Storage and methods for a single chromosome's recombination rates and marker positions. -#[derive(Debug, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] pub struct RateMap { /// The n+1 genomic positions of the markers, 0-indexed and ending at the sequence length. pub ends: Vec, @@ -84,6 +89,37 @@ pub struct RateMap { pub map_pos: Vec, } +/// An iterator over the elements of a [`RateMap`] +pub struct RateMapIter { + ends: std::vec::IntoIter, + rates: std::vec::IntoIter, + map_pos: std::vec::IntoIter, +} + +impl Iterator for RateMapIter { + type Item = (Position, RateFloat, RateFloat); + + fn next(&mut self) -> Option { + match (self.ends.next(), self.rates.next(), self.map_pos.next()) { + (Some(end), Some(rate), Some(map_pos)) => Some((end, rate, map_pos)), + _ => None, + } + } +} + +impl IntoIterator for RateMap { + type Item = (Position, RateFloat, RateFloat); + type IntoIter = RateMapIter; + + fn into_iter(self) -> Self::IntoIter { + RateMapIter { + ends: self.ends.into_iter(), + rates: self.rates.into_iter(), + map_pos: self.map_pos.into_iter(), + } + } +} + impl RateMap { /// Create a new recombination rate map for a single chromosome. pub fn new() -> Self { @@ -96,7 +132,17 @@ impl RateMap { /// Returns the spans (i.e. widths) in basepairs between each marker. pub fn span(&self) -> Vec { - self.ends.windows(2).map(|pair| pair[1] - pair[0]).collect() + self.ends + .windows(2) + .map(|pair| { + assert!( + pair[1] >= pair[0], + "invalid positions encountered while calculating span: {:?}", + &pair + ); + pair[1] - pair[0] + }) + .collect() } /// Returns the cumulative mass between each marker. @@ -128,14 +174,21 @@ impl RateMap { } } +#[derive(Clone, Debug, Default, PartialEq)] pub struct RecMap { pub map: GenomeMap, + pub metadata: Option>, } impl RecMap { - /// Create a new [`RecMap`] from a HapMap-formatted recombination map file. + /// Create a new [`RecMap`] from a (possibly gzip-compressed) HapMap-formatted recombination map file. /// - /// This method also supports reading directly from a gzip-compressed file. + /// The HapMap recombination map format is (rather unfortunately) very poorly + /// specified. This parser is quite permissive, and skips through comment lines + /// that begin with `#` and a possible header line that beings with `Chr`. + /// Note that this parser *does not* read the cumulative map positions. Instead, + /// the cumulative map positions can be calculated directly from the rates and + /// the distances between markers. /// /// The HapMap recombination format looks like: /// @@ -154,39 +207,42 @@ impl RecMap { /// /// The HapMap format is well-described in the [tskit documentation](https://tskit.dev/msprime/docs/stable/api.html#msprime.RateMap.read_hapmap). /// - /// This parser is a bit more permissive -- it will allow no headers. + /// # Warnings + /// + /// Given that the HapMap recombination map format is poorly specified, and users often + /// implement their own versions, it is **highly recommended** that you always validate + /// the parsed recombination map. Ideally check that: + /// + /// - The total map length (i.e. in Morgans or centiMorgans) makes sense. Off-by-one + /// errors due to the format not following the 0-indexed end-exclusive format assumed + /// by [`RecMap.from_hapmap`] will often lead to obviously erroneous total map lengths. + /// - Visually plot the recombination map, looking for outlier rates. + /// - If the recombination map includes a fourth + /// /// pub fn from_hapmap( filepath: &str, - seqlens: IndexMap, + seqlens: &IndexMap, ) -> Result { - let input_file = InputFile::new(filepath); - - // read one line to check for headers - let has_header = input_file.has_header("Chromosome")?; - - let buf_reader = input_file.reader()?; + let mut input_file = InputFile::new(filepath); - let mut rdr = ReaderBuilder::new() - .delimiter(b'\t') - .has_headers(has_header) - .from_reader(buf_reader); + let _has_metadata = input_file.collect_metadata("#", "Chr")?; + let reader = input_file.continue_reading()?; let mut rec_map: GenomeMap = GenomeMap::new(); - let mut last_chrom: Option = None; let mut last_end: Option = None; - for result in rdr.records() { - let record = result.map_err(RecMapError::HapMapParsingError)?; + // this is used for validation only + let mut has_fourth_column = false; + let mut map_positions: Vec = Vec::new(); - // remove comment lines (TODO could store in metadata) - if record.get(0).map_or(false, |s| s.starts_with('#')) { - continue; - } + for result in reader.lines() { + let line = result?; + let fields: Vec<&str> = line.split_whitespace().collect(); // get the chrom column - let chrom = record.get(0).ok_or(RecMapError::MissingField)?.to_string(); + let chrom = fields.first().ok_or(RecMapError::MissingField)?.to_string(); // Our parser will always add on the chromosome end. Most times the // HapMap file will *not* have this, but we still check. @@ -202,7 +258,6 @@ impl RecMap { if let Some(&last_end) = chrom_entry.ends.last() { if last_end != *seq_len { chrom_entry.ends.push(*seq_len); - chrom_entry.rates.push(0.0); } } } @@ -216,7 +271,7 @@ impl RecMap { last_chrom = Some(chrom.clone()); // get the position and rate column, parsing into proper numeric types - let end_str = record.get(1).ok_or(RecMapError::MissingField)?; + let end_str = fields.get(1).ok_or(RecMapError::MissingField)?; let end: Position = end_str.parse().map_err(|_| { dbg!(&end_str); RecMapError::ParseError(format!("Failed to parse end from string: {}", end_str)) @@ -233,7 +288,7 @@ impl RecMap { } }; - let rate_str = record.get(2).ok_or(RecMapError::MissingField)?; + let rate_str = fields.get(2).ok_or(RecMapError::MissingField)?; let rate: RateFloat = rate_str.parse().map_err(|_| { RecMapError::ParseError(format!("Failed to parse rate from string: {}", rate_str)) })?; @@ -243,6 +298,18 @@ impl RecMap { return Err(RecMapError::ImproperRate(format!("{}:{}", chrom, end))); } + // if there is a fourth column (total map length) parse it + if let Some(map_pos_str) = fields.get(3) { + has_fourth_column = true; + let map_pos: RateFloat = map_pos_str.parse().map_err(|_| { + RecMapError::ParseError(format!( + "Failed to parse map position from string: {}", + map_pos_str + )) + })?; + map_positions.push(map_pos); + } + // HapMap rates are *always* in cM/Mb, but the natural unit is Morgans, so // we convert here. let rate = CM_MB_CONVERSION * rate; @@ -266,6 +333,13 @@ impl RecMap { new_rate_map.ends.push(end); new_rate_map.rates.push(rate); + // if there is a fourth column, let's use it for validation + if has_fourth_column { + //new_rate_map.calc_cumulative_mass(); + //assert_floats_eq(&new_rate_map.map_pos, map_positions.as_slice(), 0.01); + //map_positions.clear(); + } + rec_map.insert(&chrom, new_rate_map)?; } } @@ -273,16 +347,26 @@ impl RecMap { // Insert the final entry for the last chromosome outside the loop if needed if let Some(last) = last_chrom { if let Some(seq_len) = seqlens.get(&last) { - let chrom_entry = rec_map.entry_or_default(&last); + let chrom_entry = rec_map.get_mut(&last).expect( + "internal error: please report at http://github.com/vsbuffalo/recmap/issues", + ); // Assuming the rate is 0 for these extra entries if chrom_entry.ends.is_empty() || chrom_entry.ends.last().unwrap() != seq_len { + if chrom_entry.ends.last().unwrap() >= seq_len { + let last_end = *chrom_entry.ends.last().unwrap(); + return Err(RecMapError::LengthMismatch(last, last_end, *seq_len)); + } chrom_entry.ends.push(*seq_len); } } } - let mut rec_map = RecMap { map: rec_map }; + let metadata = input_file.comments; + let mut rec_map = RecMap { + map: rec_map, + metadata, + }; rec_map.generate_map_positions(); Ok(rec_map) } @@ -349,14 +433,71 @@ impl RecMap { Ok(Array1::from_vec(positions)) } + /// Write recombination map to HapMap-formatted file. + /// + /// This file has the usual HapMap recombination map header, and columns: + /// 1. Chromosome name + /// 2. Position (0-indexed and right-exclusive) + /// 3. Rate + /// 4. Map position + /// + /// # Arguments + /// * `filepath`: The filepath to write the recombination map to. If the filepath + /// has an `.gz` extension, the output will be gzip compressed. + /// If `filepath` is `None`, uncompressed output will be written to standard out. + pub fn write_hapmap(&self, filepath: Option<&str>) -> Result<(), RecMapError> { + let mut writer: Box = match filepath { + Some(path) => { + let file = OutputFile::new(path, None); + file.writer()? + } + None => { + // Use stdout if filepath is None + Box::new(std::io::stdout()) + } + }; + + // write that weird HapMap header + writeln!(writer, "Chromosome\tPosition(bp)\tRate(cM/Mb)\tMap(cM)")?; + + for (chrom, rate_map) in self.map.iter() { + // write the rows + let n = rate_map.ends.len(); + // TODO cut off end + let ends = &rate_map.ends[1..n - 1]; + + for (i, end) in ends.iter().enumerate() { + let rate = rate_map.rates[i + 1]; + let map_pos = rate_map.map_pos[i + 1]; + + // Write the record to the file + writeln!( + writer, + "{}\t{}\t{}\t{}", + chrom, + end, + format_float(rate / CM_MB_CONVERSION), + format_float(map_pos), + )?; + } + } + + Ok(()) + } + /// Write recombination map to a BED-like TSV file. /// + /// This file has columns: + /// 1. Chromosome name + /// 2. Start position (0-indexed) + /// 3. End position (0-indexed and right-exclusive) + /// 4. Rate + /// /// # Arguments /// * `filepath`: The filepath to write the recombination map to. If the filepath /// has an `.gz` extension, the output will be gzip compressed. /// If `filepath` is `None`, uncompressed output will be written to standard out. pub fn write_tsv(&self, filepath: Option<&str>) -> Result<(), RecMapError> { - let precision = 8; let mut writer: Box = match filepath { Some(path) => { let file = OutputFile::new(path, None); @@ -383,7 +524,7 @@ impl RecMap { // Write the record to the file let rate_rescaled: RateFloat = rate / CM_MB_CONVERSION; - let formatted_rate = format!("{:.1$e}", rate_rescaled, precision); + let formatted_rate = format!("{:.1$e}", rate_rescaled, RATE_PRECISION); // Write the record to the file writeln!( @@ -398,34 +539,108 @@ impl RecMap { } } +fn format_float(x: T) -> String +where + T: Float + LowerExp, +{ + format!("{:.1$e}", x, RATE_PRECISION) +} + #[cfg(test)] mod tests { - use crate::RecMap; + use super::Position; + use crate::{ + numeric::{assert_float_eq, assert_floats_eq}, + prelude::*, + }; + use indexmap::IndexMap; use tempfile::tempdir; - fn read_hapmap() -> RecMap { + fn mock_seqlens() -> IndexMap { let seqlens = indexmap::indexmap! { - "chr1".to_string() => 6980669, - "chr2".to_string() => 6004443, - "chr3".to_string() => 6026894, + "chr1".to_string() => 25, + "chr2".to_string() => 32, + "chr3".to_string() => 22, }; - let rec_map = RecMap::from_hapmap("tests_data/decode_2010_test_map.txt", seqlens).unwrap(); - - let dir = tempdir().unwrap(); - let output_path = dir.path().join("output.tsv"); + seqlens + } + fn read_hapmap() -> RecMap { + let seqlens = mock_seqlens(); + let rec_map = RecMap::from_hapmap("tests_data/test_hapmap.txt", &seqlens).unwrap(); rec_map - .write_tsv(Some(output_path.to_str().unwrap())) - .unwrap(); + } - dir.close().unwrap(); - rec_map + fn to_morgans(x: Vec) -> Vec { + x.iter().map(|v| v * CM_MB_CONVERSION).collect() } #[test] - fn test_hapmap_read() { + fn test_read_hapmap() { let rm = read_hapmap(); assert_eq!(rm.len(), 3); assert!(!rm.is_empty()); + + dbg!(&rm.map.get("chr1").unwrap().map_pos); + + let chr1_map = rm.map.get("chr1").unwrap(); + assert_eq!(chr1_map.ends.len(), chr1_map.rates.len() + 1); + assert_eq!(chr1_map.ends, vec![0, 10, 15, 20, 25]); + assert_eq!(chr1_map.rates, to_morgans(vec![0.0, 1.10, 1.50, 4.33])); + + // cumulative map calculation: + // [0-10): *implied 0.0* + // [10-15): 1.10 + // [15-20): 1.50 + // [20-25): 4.33 *to end* + // ---- + // 5bp * 1.10 = 5.5 + // 5bp * 1.5 = 7.5 + 5.5 = 13 + // 5bp * 4.33 = 21.65 + 13 = 34.65 + // total = 34.65 + + let total_len = *chr1_map.total_map_length().unwrap(); + assert_float_eq(total_len, 34.65 * CM_MB_CONVERSION, 1e-3); + assert_floats_eq( + &chr1_map.map_pos, + &to_morgans(vec![0.0, 5.5, 13., 34.65]), + 1e-3, + ); + } + + #[test] + #[ignore = "for debugging"] + fn test_write_hapmap_local() { + // this writes the output "locally" in the project directory + // for easier debugging. + let seqlens = mock_seqlens(); + + let rm = read_hapmap(); + + let filepath = "test_hapmap.txt"; + + rm.write_hapmap(Some(filepath)).unwrap(); + + let rm_readin = RecMap::from_hapmap(filepath, &seqlens).unwrap(); + + assert_eq!(rm_readin, rm); + } + + #[test] + fn test_write_hapmap() { + let seqlens = mock_seqlens(); + + let rm = read_hapmap(); + + // temp dir + let dir = tempdir().unwrap(); + let binding = dir.path().join("test_hapmap.txt"); + let filepath = binding.to_str().unwrap(); + + rm.write_hapmap(Some(filepath)).unwrap(); + + let rm_readin = RecMap::from_hapmap(filepath, &seqlens).unwrap(); + + assert_eq!(rm_readin, rm); } }