From b184631d83ae132a503ab1cfaf428276c3529744 Mon Sep 17 00:00:00 2001
From: Vince Buffalo
Date: Tue, 5 Mar 2024 11:58:08 -0800
Subject: [PATCH] Added new test utilities, increased default BED test file length.

- New `copy_tempfile_for_inspection()` function for inspecting tempfiles that fail tests.
- Added `scripts/seed_grinder.sh`, which tries random seeds looking for failures.
- Fixed a minor bug where `u32` was being used instead of `Position`.
---
 Cargo.toml                   |   4 +-
 scripts/seed_grinder.sh      |  16 ++
 src/commands.rs              | 307 +++++++++++++++++++----------
 src/main/mod.rs              |  14 +-
 src/test_utilities.rs        |  81 ++++++---
 tests/bedtools_validation.rs |  18 +-
 6 files changed, 265 insertions(+), 175 deletions(-)
 create mode 100644 scripts/seed_grinder.sh

diff --git a/Cargo.toml b/Cargo.toml
index 95bb0e6..aac5ff7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "granges"
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 license = "MIT"
 authors = ["Vince Buffalo "]
@@ -12,7 +12,7 @@ description = "A Rust library and command line tool for genomic range operations"
 
 [dependencies]
 # clap = { version = "4.4.18", features = ["derive"], optional = true }
-clap = { version = "4.4.18", features = ["derive"] }
+clap = { version = "4.4.18", features = ["derive", "wrap_help"] }
 coitrees = { version = "0.4.0", features = ["nosimd"] }
 flate2 = { version = "1.0.28", features = ["zlib-ng-compat"] }
 genomap = "0.2.6"
diff --git a/scripts/seed_grinder.sh b/scripts/seed_grinder.sh
new file mode 100644
index 0000000..6ee6eb1
--- /dev/null
+++ b/scripts/seed_grinder.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+START_SEED=1
+END_SEED=10000
+
+for SEED in $(seq $START_SEED $END_SEED); do
+    echo "Running tests with TEST_SEED=$SEED"
+    TEST_SEED=$SEED cargo test --test bedtools_validation
+    if [ $? -ne 0 ]; then
+        echo "Tests failed with TEST_SEED=$SEED"
+        exit 1
+    else
+        echo "Tests passed with TEST_SEED=$SEED"
+    fi
+done
+
diff --git a/src/commands.rs b/src/commands.rs
index f34c634..771135e 100644
--- a/src/commands.rs
+++ b/src/commands.rs
@@ -718,19 +718,29 @@ fn transpose<T>(v: Vec<Vec<T>>) -> Vec<Vec<T>> {
         .collect()
 }
 
-/// Histogram a set of features for a defined window size.
+/// Calculate the density of features in a BED4 file, per window. There are two
+/// modes:
+///
+/// 1. Without --exclusive (default): a basepair overlapping two features will
+/// increment the counts of both.
+///
+/// 2. With --exclusive: a basepair overlapping two features will be assigned to
+/// a new composite "feature set" of the two features, and will increment the
+/// count of that set. In contrast to the default case, this means every
+/// overlapping basepair in a window is assigned exclusively to one feature set.
 ///
 /// # Tips
 /// This is useful for a quick exploratory look at feature density
 /// by window, e.g. with
 ///
-/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
+/// $ granges feature-density --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome
 /// hg38_seqlens.tsv --headers | xsv table -d'\t' | less
 ///
 /// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing
 /// the results more clearly.
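+///
+/// To make the two modes concrete (purely illustrative numbers): in a 1,000 bp
+/// window where 200 bp fall within an "exon" range, 50 of which also fall within
+/// a "CDS" range, the default mode reports exon = 200 and CDS = 50, while
+/// --exclusive reports "exon" = 150 and "CDS,exon" = 50.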
+/// #[derive(Parser)] -pub struct HistFeatures { +pub struct FeatureDensity { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] genome: PathBuf, @@ -751,8 +761,8 @@ pub struct HistFeatures { #[arg(short, long)] chop: bool, - /// Assign each basepair exclusively to a single "feature set" - /// based on what features overlap it, and count the number of + /// Assign each basepair exclusively to a single "feature set" + /// based on what features overlap it, and count the number of /// these per window. E.g. a region that overlaps "CDS" and "exon" /// will be called a "CDS,exon" feature. Columns are sorted /// in descending order by column sums. Headers will always be @@ -773,66 +783,171 @@ pub struct HistFeatures { output: Option, } -impl HistFeatures { - pub fn run(&self) -> Result, GRangesError> { +type GRangesFeatureMatrix = GRanges>>; + +impl FeatureDensity { + /// Calculate feature density per window, with non-exclusive assignment + /// of basepairs to features. E.g. a basepair that overlaps + /// "CDS" and "exon" features will be added to the tallies of both. + pub fn feature_density(&self) -> Result<(GRangesFeatureMatrix, Vec), GRangesError> { let bedfile = &self.bedfile; let genome = read_seqlens(&self.genome)?; let bed4_iter = Bed4Iterator::new(bedfile)?; - if !self.exclusive { - // Split the elements in the iterator by feature into multiple GRanges objects. - let mut records_by_features: HashMap> = - HashMap::new(); - for result in bed4_iter { - let range = result?; - let feature = &range.data.name; - let vec = records_by_features.entry(feature.to_string()).or_default(); - vec.push(range.into_empty()) // drop the feature, since we're hashing on it - } + // Split the elements in the iterator by feature into multiple GRanges objects. + let mut records_by_features: HashMap> = + HashMap::new(); + for result in bed4_iter { + let range = result?; + let feature = &range.data.name; + let vec = records_by_features.entry(feature.to_string()).or_default(); + vec.push(range.into_empty()) // drop the feature, since we're hashing on it + } + + // Load all the *merged* split records into memory as interval trees. + let mut gr_by_features: HashMap> = HashMap::new(); + for (feature, ranges) in records_by_features.into_iter() { + // doing a streaming merge of each feature; bookend merge + let merging_iter = MergingEmptyIterator::new(ranges, 0); + + // load into memory and convert to interval trees + let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + + assert!(!gr_by_features.contains_key(&feature)); + gr_by_features.insert(feature, gr); + } + + // Now, create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + let features: Vec<_> = gr_by_features.keys().cloned().collect(); + let mut feature_matrix = Vec::new(); + for (_feature, gr) in gr_by_features.into_iter() { + // find the features that overlap each window + // TODO/OPTIMIZE: how costly is this clone? + let window_counts = windows + .clone() + .left_overlaps(&gr)? + .map_joins(|joins| { + // these are merged, so this is the number of *unique* basepairs + let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); + total_overlaps + })? + .take_data()?; + feature_matrix.push(window_counts); + } - // Load all the *merged* split records into memory as interval trees. 
- let mut gr_by_features: HashMap> = HashMap::new(); - for (feature, ranges) in records_by_features.into_iter() { - // doing a streaming merge of each feature; bookend merge - let merging_iter = MergingEmptyIterator::new(ranges, 0); + let feature_matrix = transpose(feature_matrix); - // load into memory and convert to interval trees - let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + // Unite the windows with the feature matrix. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + let windows = windows.into_granges_data(feature_matrix)?; + Ok((windows, features)) + } - assert!(!gr_by_features.contains_key(&feature)); - gr_by_features.insert(feature, gr); + /// Calculate feature density, but assign each basepair exclusively to a single + /// "feature set", which is the unique set of features that a particular + /// basepair overlaps. + pub fn feature_density_exclusive(&self) -> Result<(GRangesFeatureMatrix, Vec), GRangesError> { + let bedfile = &self.bedfile; + let genome = read_seqlens(&self.genome)?; + let bed4_iter = Bed4Iterator::new(bedfile)?; + + // Create GRanges of the feature indices. + // We do this manually to avoid creating a needless data container. + let mut gr = GRanges::new_vec_keyed(&genome); + for result in bed4_iter { + let range = result?; + + // Note the trick here: we are *re-using* indices. + gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; + } + let gr = gr.into_coitrees()?; + + // clone the feature map + let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); + + // Create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + // "Reduce" ranges and + let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { + // "reduce" the ranges to a minimum spanning set that contains a vec of indices + let ranges = join.join.reduce_ranges(); + let mut feature_overlaps: HashMap, _> = HashMap::new(); + for range in ranges.iter() { + let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); + key.sort(); // sorted key is compound key + *feature_overlaps.entry(key).or_insert(0) += range.width(); } - // Now, create windows. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - let features: Vec<_> = gr_by_features.keys().cloned().collect(); - let mut feature_matrix = Vec::new(); - for (_feature, gr) in gr_by_features.into_iter() { - // find the features that overlap each window - // TODO/OPTIMIZE: how costly is this clone? - let window_counts = windows - .clone() - .left_overlaps(&gr)? - .map_joins(|joins| { - // these are merged, so this is the number of *unique* basepairs - let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); - total_overlaps - })? - .take_data()?; - feature_matrix.push(window_counts); + feature_overlaps + })?; + + // Now, we go over the data and get the observed sets + let mut observed_sets = UniqueIdentifier::new(); + let data = windows_overlaps + .data() + .ok_or(GRangesError::NoDataContainer)?; + for feature_counts in data.iter() { + for feature_set in feature_counts.keys() { + observed_sets.get_or_insert(feature_set); + } + } + let nsets = observed_sets.len(); // total feature sets + + // Go over data again, making the sparse hashmap into a full + // matrix of overlappng basepairs. 
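+        // Column order here is the iteration order of `observed_sets`; the
+        // feature-set labels created below are built in the same order, before
+        // both are re-sorted by column totals.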
+ let window_counts = windows_overlaps.map_data(|feature_counts| { + // fill out matrix + let mut counts = vec![0; nsets]; + for (i, feature_set) in observed_sets.keys().enumerate() { + counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); } + counts + })?; + + // To make this prettier, let's sort by column totals (this + // is a little pricey; we can make option later if needed). + let matrix = window_counts.data().ok_or(GRangesError::NoDataContainer)?; + let col_totals = column_totals(matrix); + let sorted_indices = sorted_indices_by_values(&col_totals); + + let window_counts = window_counts.map_data(|row| { + let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); + row + })?; + + // Create the labels. + let feature_sets: Vec = observed_sets + .keys() + .map(|indices_key| { + let labels: Vec<_> = indices_key + .iter() + .map(|idx| feature_map.get_key(*idx).unwrap().clone()) + .collect(); + labels.join(",") + }) + .collect(); - let feature_matrix = transpose(feature_matrix); + // Re-order the headers too by the sorted indices. + let feature_sets: Vec<_> = sorted_indices + .iter() + .map(|i| feature_sets[*i].clone()) + .collect(); - // Unite the windows with the feature matrix. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - let windows = windows.into_granges_data(feature_matrix)?; + Ok((window_counts, feature_sets)) + } + + /// Run this command given the command line interface. + pub fn run(&self) -> Result, GRangesError> { + if !self.exclusive { + let (window_counts, features) = self.feature_density()?; // Write everything. if !self.headers { // we have to *manually* add headers (serde needs flat structs otherwise) - windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + window_counts.write_to_tsv(self.output.as_ref(), &BED_TSV)?; } else { let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; headers.extend(features); @@ -841,92 +956,10 @@ impl HistFeatures { no_value_string: "NA".to_string(), headers: Some(headers), }; - windows.write_to_tsv(self.output.as_ref(), &config)?; + window_counts.write_to_tsv(self.output.as_ref(), &config)?; } } else { - // This is done a bit differently than the non-classify case - - // Create GRanges of the feature indices. - // We do this manually to avoid creating a needless data container. - let mut gr = GRanges::new_vec_keyed(&genome); - for result in bed4_iter { - let range = result?; - - // Note the trick here: we are *re-using* indices. - gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; - } - let gr = gr.into_coitrees()?; - - // clone the feature map - let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); - - // Create windows. 
- let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - // "Reduce" ranges and - let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { - // "reduce" the ranges to a minimum spanning set that contains a vec of indices - let ranges = join.join.reduce_ranges(); - let mut feature_overlaps: HashMap, _> = HashMap::new(); - for range in ranges.iter() { - let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); - key.sort(); // sorted key is compound key - *feature_overlaps.entry(key).or_insert(0) += range.width(); - } - - feature_overlaps - })?; - - // Now, we go over the data and get the observed sets - let mut observed_sets = UniqueIdentifier::new(); - let data = windows_overlaps - .data() - .ok_or(GRangesError::NoDataContainer)?; - for feature_counts in data.iter() { - for feature_set in feature_counts.keys() { - observed_sets.get_or_insert(feature_set); - } - } - let nsets = observed_sets.len(); // total feature sets - - // Go over data again, making the sparse hashmap into a full - // matrix of overlappng basepairs. - let gr = windows_overlaps.map_data(|feature_counts| { - // fill out matrix - let mut counts = vec![0; nsets]; - for (i, feature_set) in observed_sets.keys().enumerate() { - counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); - } - counts - })?; - - // To make this prettier, let's sort by column totals (this - // is a little pricey; we can make option later if needed). - let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?; - let col_totals = column_totals(matrix); - let sorted_indices = sorted_indices_by_values(&col_totals); - - let gr = gr.map_data(|row| { - let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); - row - })?; - - // Create the labels. - let feature_sets: Vec = observed_sets - .keys() - .map(|indices_key| { - let labels: Vec<_> = indices_key - .iter() - .map(|idx| feature_map.get_key(*idx).unwrap().clone()) - .collect(); - labels.join(",") - }) - .collect(); - - // Re-order the headers too by the sorted indices. 
- let feature_sets: Vec<_> = sorted_indices.iter() - .map(|i| feature_sets[*i].clone()).collect(); - + let (window_counts, feature_sets) = self.feature_density_exclusive()?; let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; headers.extend(feature_sets); @@ -934,14 +967,13 @@ impl HistFeatures { no_value_string: "NA".to_string(), headers: Some(headers), }; - gr.write_to_tsv(self.output.as_ref(), &config)?; + window_counts.write_to_tsv(self.output.as_ref(), &config)?; } Ok(CommandOutput::new((), None)) } } - -// get column totals +// get column totals fn column_totals(matrix: &Vec>) -> Vec { if matrix.is_empty() || matrix[0].is_empty() { return Vec::new(); @@ -959,7 +991,7 @@ fn column_totals(matrix: &Vec>) -> Vec { } totals -} +} // get descending indices order fn sorted_indices_by_values(values: &[Position]) -> Vec { @@ -967,4 +999,3 @@ fn sorted_indices_by_values(values: &[Position]) -> Vec { indices.sort_by_key(|&i| std::cmp::Reverse(values[i])); indices } - diff --git a/src/main/mod.rs b/src/main/mod.rs index 70c6437..f63f0b8 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -4,7 +4,7 @@ use clap::{Parser, Subcommand}; use granges::{ commands::{ granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, FilterChroms, - HistFeatures, Merge, ProcessingMode, + FeatureDensity, Merge, ProcessingMode, }, data::operations::FloatOperation, prelude::GRangesError, @@ -26,8 +26,12 @@ Subcommands: overlap with a right range. This is equivalent to a filtering "semi-join" in SQL terminology. - hist-features: Histogram feature coverage per windows. This merges overlapping - ranges first. + feature-density Calculate the density of features per window, e.g. how many + basepairs are "exon", "CDS", etc. With --exclusive, this will assign + each basepair exclusively to a unique "feature set", such that + if a basepair overlaps "CDS" and "exon" ranges, the overlapping + number of basepairs will be added to a new composite "CDS,exon" + feature set. map: Compute the left grouped overlaps between the left genomic ranges and right genomic ranges, and apply one or more operations to the @@ -146,7 +150,7 @@ enum Commands { #[arg(long)] in_mem: bool, }, - HistFeatures(HistFeatures), + FeatureDensity(FeatureDensity), /// Do a "left grouped join", on the specified left and right genomic ranges, /// and apply one or more functions to the BED5 scores for all right genomic /// ranges. @@ -313,7 +317,7 @@ fn run() -> Result<(), GRangesError> { ) } // NOTE: this is the new API, so clean! - Some(Commands::HistFeatures(hist_features)) => hist_features.run(), + Some(Commands::FeatureDensity(density)) => density.run(), Some(Commands::Merge(merge)) => merge.run(), Some(Commands::Windows { genome, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 38dcbec..0f9f70c 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,7 +1,13 @@ //! Test cases and test utility functions. //! -use std::{io::BufRead, path::PathBuf}; +use std::{ + env, + fs::{self, copy}, + io::BufRead, + path::PathBuf, +}; +use tempfile::{Builder, NamedTempFile}; use crate::{ commands::granges_random_bed, @@ -22,12 +28,16 @@ use genomap::GenomeMap; use indexmap::IndexMap; #[cfg(feature = "ndarray")] use ndarray::{Array1, Array2}; -use rand::{rngs::StdRng, distributions::Uniform, seq::SliceRandom, Rng, SeedableRng}; - -// Set the seed for reproducible tests. 
-const SEED: u64 = 4; - -use tempfile::{Builder, NamedTempFile}; +use rand::{distributions::Uniform, rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + +// Use a default random seed, unless one is in the environment. +fn get_rng() -> StdRng { + let seed: u64 = env::var("TEST_SEED") + .unwrap_or_else(|_| "13".to_string()) + .parse() + .expect("TEST_SEED must be a valid u64"); + StdRng::seed_from_u64(seed) +} /// Get the path to the `grange` command line tool after a build. /// This is used for integration tests and benchmarks. @@ -62,31 +72,29 @@ pub const MAX_CHROM_LEN: Position = 250_000_000; /// Build a random range start/end on a sequence of `max_len`. /// 0-indexed, right exclusive -pub fn random_range(chrom_len: Position) -> (Position, Position) { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_range(chrom_len: Position, rng: &mut StdRng) -> (Position, Position) { let len = rng.gen_range(MIN_LEN..MAX_LEN); let start = rng.gen_range(0..chrom_len - len + 1); (start, start + len) } /// Build random sequence lengths -pub fn random_seqlen() -> Position { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_seqlen(rng: &mut StdRng) -> Position { rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome -pub fn random_chrom() -> String { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_chrom(rng: &mut StdRng) -> String { format!("chr{}", rng.gen_range(1..NCHROM + 1)) } /// Build random [`VecRanges`] pub fn random_vecranges(n: usize) -> VecRanges { - let seqlen = random_seqlen(); + let mut rng = get_rng(); + let seqlen = random_seqlen(&mut rng); let mut vr = VecRanges::new(seqlen); for _i in 0..n { - let (start, end) = random_range(seqlen); + let (start, end) = random_range(seqlen, &mut rng); let range = RangeEmpty::new(start, end); vr.push_range(range); } @@ -99,7 +107,7 @@ pub fn random_granges( seqlens: &IndexMap, num: usize, ) -> Result, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRangesEmpty::new_vec(seqlens); @@ -109,15 +117,14 @@ pub fn random_granges( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); gr.push_range(seqname, start, end)?; } Ok(gr) } /// Generate random strings, e.g. for mock feature names. -fn generate_random_string(n: usize) -> String { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_string(n: usize, rng: &mut StdRng) -> String { let letters: Vec = ('a'..='z').collect(); let letters_dist = Uniform::from(0..letters.len()); @@ -125,8 +132,7 @@ fn generate_random_string(n: usize) -> String { } /// Generate a random float value, e.g. for a mock BED "score". 
-fn generate_random_uniform(start: f64, end: f64) -> f64 { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_uniform(start: f64, end: f64, rng: &mut StdRng) -> f64 { let uniform = Uniform::new(start, end); // Specify the range rng.sample(uniform) } @@ -137,7 +143,7 @@ pub fn random_granges_mock_bed5( seqlens: &IndexMap, num: usize, ) -> Result>, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRanges::new_vec(seqlens); @@ -147,10 +153,10 @@ pub fn random_granges_mock_bed5( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); let bed5_cols = Bed5Addition { - name: generate_random_string(8), - score: Some(generate_random_uniform(0.0, 1.0)), + name: generate_random_string(8, &mut rng), + score: Some(generate_random_uniform(0.0, 1.0, &mut rng)), }; gr.push_range(seqname, start, end, bed5_cols)?; } @@ -166,10 +172,31 @@ pub fn random_coitrees() -> COITrees<()> { /// Get a temporary file with a .bed suffix. pub fn temp_bedfile() -> NamedTempFile { - Builder::new() + let tempfile = Builder::new() .suffix(".bed") .tempfile() - .expect("Failed to create temp file") + .expect("Failed to create temp file"); + tempfile +} + +/// Copy a temp file for inspection, allowing the specification of a custom temporary directory name. +// AICODE -- co-written with AI +pub fn copy_tempfile_for_inspection(tempfile: impl Into, new_file_name: &str) -> PathBuf { + let tempfile = tempfile.into(); + // Check if the TEST_LOCAL environment variable is set + if env::var("TEST_LOCAL").is_ok() { + let dir_path = PathBuf::from("test_temp"); + fs::create_dir_all(&dir_path).expect("could not create test_temp/"); + + let dest_path = dir_path.join(new_file_name); + + copy(tempfile, &dest_path).expect("could not copy temp file"); + + dest_path + } else { + // If TEST_LOCAL is not set, simply return the path of the original temp file + tempfile + } } /// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 09dc639..67b0f0a 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -93,7 +93,7 @@ fn validate_bedfloats( // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. 
#[cfg(not(feature = "bench-big"))] -const BED_LENGTH: usize = 10; +const BED_LENGTH: usize = 100_000; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; @@ -322,6 +322,10 @@ fn test_against_bedtools_makewindows() { let widths = vec![131123, 1_000_0013]; let steps = vec![10_001, 10_113]; + // smaller test set: + // let widths = vec![1_000_0013]; + // let steps = vec![10000]; + for width in widths.iter() { for step in steps.iter() { let bedtools_output = Command::new("bedtools") @@ -441,8 +445,8 @@ fn test_against_bedtools_map_multiple() { // try multiple operations at once let num_ranges = BED_LENGTH; let width = 1_000_000; - #[allow(unused_variables)] - let step = 10_000; // can uncomment lines below to test this + // #[allow(unused_variables)] + // let step = 10_000; // can uncomment lines below to test this // make windows let windows_file = temp_bedfile(); @@ -464,8 +468,11 @@ fn test_against_bedtools_map_multiple() { granges_windows_output ); + // copy_tempfile_for_inspection(&windows_file.path(), "windows.bed"); + // create the random data BED5 let bedscores_file = random_bed5file(num_ranges); + // copy_tempfile_for_inspection(&bedscores_file.path(), "scores.bed"); let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); @@ -485,6 +492,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("bedtools map failed"); + // copy_tempfile_for_inspection(&bedtools_path.path(), "bedtools.bed"); + let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("map") @@ -501,6 +510,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("granges map failed"); + // copy_tempfile_for_inspection(&granges_output_file.path(), "granges.bed"); + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); @@ -539,6 +550,7 @@ fn test_against_bedtools_map_multiple() { .iter() .zip(bedtools_data.iter()) .for_each(|(gr, bd)| { + // dbg!((gr.min, bd.min)); assert_option_float_tol!(gr.min, bd.min); assert_option_float_tol!(gr.max, bd.max);
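
One possible workflow with the pieces added in this patch (the seed below is an
arbitrary example): let scripts/seed_grinder.sh hunt for a failing seed, then
replay that seed directly. With the copy_tempfile_for_inspection() calls above
uncommented, TEST_LOCAL=1 also copies the generated BED files into test_temp/
for side-by-side inspection:

    bash scripts/seed_grinder.sh
    # suppose it reports: "Tests failed with TEST_SEED=4217"
    TEST_SEED=4217 TEST_LOCAL=1 cargo test --test bedtools_validation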