From 99ac17d05e03b9bf42ba0088900f577dbdfd43fe Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 5 Mar 2024 11:58:08 -0800 Subject: [PATCH] Added new test utilities, increased default BED test file length. - New `copy_tempfile_for_inspection()` function for inspecting tempfiles that fail tests. --- src/commands.rs | 16 +++---- src/test_utilities.rs | 81 ++++++++++++++++++++++++------------ tests/bedtools_validation.rs | 18 ++++++-- 3 files changed, 77 insertions(+), 38 deletions(-) diff --git a/src/commands.rs b/src/commands.rs index f34c634..8600007 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -751,8 +751,8 @@ pub struct HistFeatures { #[arg(short, long)] chop: bool, - /// Assign each basepair exclusively to a single "feature set" - /// based on what features overlap it, and count the number of + /// Assign each basepair exclusively to a single "feature set" + /// based on what features overlap it, and count the number of /// these per window. E.g. a region that overlaps "CDS" and "exon" /// will be called a "CDS,exon" feature. Columns are sorted /// in descending order by column sums. Headers will always be @@ -924,8 +924,10 @@ impl HistFeatures { .collect(); // Re-order the headers too by the sorted indices. - let feature_sets: Vec<_> = sorted_indices.iter() - .map(|i| feature_sets[*i].clone()).collect(); + let feature_sets: Vec<_> = sorted_indices + .iter() + .map(|i| feature_sets[*i].clone()) + .collect(); let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; headers.extend(feature_sets); @@ -940,8 +942,7 @@ impl HistFeatures { } } - -// get column totals +// get column totals fn column_totals(matrix: &Vec>) -> Vec { if matrix.is_empty() || matrix[0].is_empty() { return Vec::new(); @@ -959,7 +960,7 @@ fn column_totals(matrix: &Vec>) -> Vec { } totals -} +} // get descending indices order fn sorted_indices_by_values(values: &[Position]) -> Vec { @@ -967,4 +968,3 @@ fn sorted_indices_by_values(values: &[Position]) -> Vec { indices.sort_by_key(|&i| std::cmp::Reverse(values[i])); indices } - diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 38dcbec..0f9f70c 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,7 +1,13 @@ //! Test cases and test utility functions. //! -use std::{io::BufRead, path::PathBuf}; +use std::{ + env, + fs::{self, copy}, + io::BufRead, + path::PathBuf, +}; +use tempfile::{Builder, NamedTempFile}; use crate::{ commands::granges_random_bed, @@ -22,12 +28,16 @@ use genomap::GenomeMap; use indexmap::IndexMap; #[cfg(feature = "ndarray")] use ndarray::{Array1, Array2}; -use rand::{rngs::StdRng, distributions::Uniform, seq::SliceRandom, Rng, SeedableRng}; - -// Set the seed for reproducible tests. -const SEED: u64 = 4; - -use tempfile::{Builder, NamedTempFile}; +use rand::{distributions::Uniform, rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + +// Use a default random seed, unless one is in the environment. +fn get_rng() -> StdRng { + let seed: u64 = env::var("TEST_SEED") + .unwrap_or_else(|_| "13".to_string()) + .parse() + .expect("TEST_SEED must be a valid u64"); + StdRng::seed_from_u64(seed) +} /// Get the path to the `grange` command line tool after a build. /// This is used for integration tests and benchmarks. @@ -62,31 +72,29 @@ pub const MAX_CHROM_LEN: Position = 250_000_000; /// Build a random range start/end on a sequence of `max_len`. /// 0-indexed, right exclusive -pub fn random_range(chrom_len: Position) -> (Position, Position) { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_range(chrom_len: Position, rng: &mut StdRng) -> (Position, Position) { let len = rng.gen_range(MIN_LEN..MAX_LEN); let start = rng.gen_range(0..chrom_len - len + 1); (start, start + len) } /// Build random sequence lengths -pub fn random_seqlen() -> Position { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_seqlen(rng: &mut StdRng) -> Position { rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome -pub fn random_chrom() -> String { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_chrom(rng: &mut StdRng) -> String { format!("chr{}", rng.gen_range(1..NCHROM + 1)) } /// Build random [`VecRanges`] pub fn random_vecranges(n: usize) -> VecRanges { - let seqlen = random_seqlen(); + let mut rng = get_rng(); + let seqlen = random_seqlen(&mut rng); let mut vr = VecRanges::new(seqlen); for _i in 0..n { - let (start, end) = random_range(seqlen); + let (start, end) = random_range(seqlen, &mut rng); let range = RangeEmpty::new(start, end); vr.push_range(range); } @@ -99,7 +107,7 @@ pub fn random_granges( seqlens: &IndexMap, num: usize, ) -> Result, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRangesEmpty::new_vec(seqlens); @@ -109,15 +117,14 @@ pub fn random_granges( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); gr.push_range(seqname, start, end)?; } Ok(gr) } /// Generate random strings, e.g. for mock feature names. -fn generate_random_string(n: usize) -> String { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_string(n: usize, rng: &mut StdRng) -> String { let letters: Vec = ('a'..='z').collect(); let letters_dist = Uniform::from(0..letters.len()); @@ -125,8 +132,7 @@ fn generate_random_string(n: usize) -> String { } /// Generate a random float value, e.g. for a mock BED "score". -fn generate_random_uniform(start: f64, end: f64) -> f64 { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_uniform(start: f64, end: f64, rng: &mut StdRng) -> f64 { let uniform = Uniform::new(start, end); // Specify the range rng.sample(uniform) } @@ -137,7 +143,7 @@ pub fn random_granges_mock_bed5( seqlens: &IndexMap, num: usize, ) -> Result>, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRanges::new_vec(seqlens); @@ -147,10 +153,10 @@ pub fn random_granges_mock_bed5( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); let bed5_cols = Bed5Addition { - name: generate_random_string(8), - score: Some(generate_random_uniform(0.0, 1.0)), + name: generate_random_string(8, &mut rng), + score: Some(generate_random_uniform(0.0, 1.0, &mut rng)), }; gr.push_range(seqname, start, end, bed5_cols)?; } @@ -166,10 +172,31 @@ pub fn random_coitrees() -> COITrees<()> { /// Get a temporary file with a .bed suffix. pub fn temp_bedfile() -> NamedTempFile { - Builder::new() + let tempfile = Builder::new() .suffix(".bed") .tempfile() - .expect("Failed to create temp file") + .expect("Failed to create temp file"); + tempfile +} + +/// Copy a temp file for inspection, allowing the specification of a custom temporary directory name. +// AICODE -- co-written with AI +pub fn copy_tempfile_for_inspection(tempfile: impl Into, new_file_name: &str) -> PathBuf { + let tempfile = tempfile.into(); + // Check if the TEST_LOCAL environment variable is set + if env::var("TEST_LOCAL").is_ok() { + let dir_path = PathBuf::from("test_temp"); + fs::create_dir_all(&dir_path).expect("could not create test_temp/"); + + let dest_path = dir_path.join(new_file_name); + + copy(tempfile, &dest_path).expect("could not copy temp file"); + + dest_path + } else { + // If TEST_LOCAL is not set, simply return the path of the original temp file + tempfile + } } /// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 09dc639..67b0f0a 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -93,7 +93,7 @@ fn validate_bedfloats( // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. #[cfg(not(feature = "bench-big"))] -const BED_LENGTH: usize = 10; +const BED_LENGTH: usize = 100_000; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; @@ -322,6 +322,10 @@ fn test_against_bedtools_makewindows() { let widths = vec![131123, 1_000_0013]; let steps = vec![10_001, 10_113]; + // smaller test set: + // let widths = vec![1_000_0013]; + // let steps = vec![10000]; + for width in widths.iter() { for step in steps.iter() { let bedtools_output = Command::new("bedtools") @@ -441,8 +445,8 @@ fn test_against_bedtools_map_multiple() { // try multiple operations at once let num_ranges = BED_LENGTH; let width = 1_000_000; - #[allow(unused_variables)] - let step = 10_000; // can uncomment lines below to test this + // #[allow(unused_variables)] + // let step = 10_000; // can uncomment lines below to test this // make windows let windows_file = temp_bedfile(); @@ -464,8 +468,11 @@ fn test_against_bedtools_map_multiple() { granges_windows_output ); + // copy_tempfile_for_inspection(&windows_file.path(), "windows.bed"); + // create the random data BED5 let bedscores_file = random_bed5file(num_ranges); + // copy_tempfile_for_inspection(&bedscores_file.path(), "scores.bed"); let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); @@ -485,6 +492,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("bedtools map failed"); + // copy_tempfile_for_inspection(&bedtools_path.path(), "bedtools.bed"); + let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("map") @@ -501,6 +510,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("granges map failed"); + // copy_tempfile_for_inspection(&granges_output_file.path(), "granges.bed"); + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); @@ -539,6 +550,7 @@ fn test_against_bedtools_map_multiple() { .iter() .zip(bedtools_data.iter()) .for_each(|(gr, bd)| { + // dbg!((gr.min, bd.min)); assert_option_float_tol!(gr.min, bd.min); assert_option_float_tol!(gr.max, bd.max);