From b184631d83ae132a503ab1cfaf428276c3529744 Mon Sep 17 00:00:00 2001
From: Vince Buffalo
Date: Tue, 5 Mar 2024 11:58:08 -0800
Subject: [PATCH] Added new test utilities, increased default BED test file length.

- New `copy_tempfile_for_inspection()` function for inspecting tempfiles that fail tests.
- Added `scripts/seed_grinder.sh`, which tries random seeds looking for failures.
- Fixed a minor bug where `u32` was being used instead of `Position`.
---
 Cargo.toml                   |   4 +-
 scripts/seed_grinder.sh      |  16 ++
 src/commands.rs              | 307 +++++++++++++++++++----------
 src/main/mod.rs              |  14 +-
 src/test_utilities.rs        |  81 ++++++---
 tests/bedtools_validation.rs |  18 +-
 6 files changed, 265 insertions(+), 175 deletions(-)
 create mode 100644 scripts/seed_grinder.sh

diff --git a/Cargo.toml b/Cargo.toml
index 95bb0e6..aac5ff7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "granges"
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 license = "MIT"
 authors = ["Vince Buffalo "]
@@ -12,7 +12,7 @@ description = "A Rust library and command line tool for genomic range operations"
 
 [dependencies]
 # clap = { version = "4.4.18", features = ["derive"], optional = true }
-clap = { version = "4.4.18", features = ["derive"] }
+clap = { version = "4.4.18", features = ["derive", "wrap_help"] }
 coitrees = { version = "0.4.0", features = ["nosimd"] }
 flate2 = { version = "1.0.28", features = ["zlib-ng-compat"] }
 genomap = "0.2.6"
diff --git a/scripts/seed_grinder.sh b/scripts/seed_grinder.sh
new file mode 100644
index 0000000..6ee6eb1
--- /dev/null
+++ b/scripts/seed_grinder.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+START_SEED=1
+END_SEED=10000
+
+for SEED in $(seq $START_SEED $END_SEED); do
+    echo "Running tests with TEST_SEED=$SEED"
+    TEST_SEED=$SEED cargo test --test bedtools_validation
+    if [ $? -ne 0 ]; then
+        echo "Tests failed with TEST_SEED=$SEED"
+        exit 1
+    else
+        echo "Tests passed with TEST_SEED=$SEED"
+    fi
+done
+
diff --git a/src/commands.rs b/src/commands.rs
index f34c634..771135e 100644
--- a/src/commands.rs
+++ b/src/commands.rs
@@ -718,19 +718,29 @@ fn transpose<T>(v: Vec<Vec<T>>) -> Vec<Vec<T>> {
         .collect()
 }
 
-/// Histogram a set of features for a defined window size.
+/// Calculate the density of features in a BED4 file, per window. There are two
+/// modes:
+///
+/// 1. Without --exclusive (default): a basepair overlapping two features will
+/// increment the counts of both.
+///
+/// 2. With --exclusive: a basepair overlapping two features will be assigned to
+/// a new composite "feature set" of the two features, and will increment the
+/// count of that set. In contrast to the default case, this means every
+/// overlapping basepair in a window is assigned exclusively to one feature set.
 ///
 /// # Tips
 /// This is useful for a quick exploratory look at feature density
 /// by window, e.g. with
 ///
-/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
+/// $ granges feature-density --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome
 /// hg38_seqlens.tsv --headers | xsv table -d'\t' | less
 ///
 /// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing
 /// the results more clearly.
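+///
+/// To make the two modes concrete (purely illustrative numbers): in a 1,000 bp
+/// window where 200 bp fall within an "exon" range, 50 of which also fall within
+/// a "CDS" range, the default mode reports exon = 200 and CDS = 50, while
+/// --exclusive reports "exon" = 150 and "CDS,exon" = 50.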
+/// #[derive(Parser)] -pub struct HistFeatures { +pub struct FeatureDensity { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] genome: PathBuf, @@ -751,8 +761,8 @@ pub struct HistFeatures { #[arg(short, long)] chop: bool, - /// Assign each basepair exclusively to a single "feature set" - /// based on what features overlap it, and count the number of + /// Assign each basepair exclusively to a single "feature set" + /// based on what features overlap it, and count the number of /// these per window. E.g. a region that overlaps "CDS" and "exon" /// will be called a "CDS,exon" feature. Columns are sorted /// in descending order by column sums. Headers will always be @@ -773,66 +783,171 @@ pub struct HistFeatures { output: Option, } -impl HistFeatures { - pub fn run(&self) -> Result, GRangesError> { +type GRangesFeatureMatrix = GRanges>>; + +impl FeatureDensity { + /// Calculate feature density per window, with non-exclusive assignment + /// of basepairs to features. E.g. a basepair that overlaps + /// "CDS" and "exon" features will be added to the tallies of both. + pub fn feature_density(&self) -> Result<(GRangesFeatureMatrix, Vec), GRangesError> { let bedfile = &self.bedfile; let genome = read_seqlens(&self.genome)?; let bed4_iter = Bed4Iterator::new(bedfile)?; - if !self.exclusive { - // Split the elements in the iterator by feature into multiple GRanges objects. - let mut records_by_features: HashMap> = - HashMap::new(); - for result in bed4_iter { - let range = result?; - let feature = &range.data.name; - let vec = records_by_features.entry(feature.to_string()).or_default(); - vec.push(range.into_empty()) // drop the feature, since we're hashing on it - } + // Split the elements in the iterator by feature into multiple GRanges objects. + let mut records_by_features: HashMap> = + HashMap::new(); + for result in bed4_iter { + let range = result?; + let feature = &range.data.name; + let vec = records_by_features.entry(feature.to_string()).or_default(); + vec.push(range.into_empty()) // drop the feature, since we're hashing on it + } + + // Load all the *merged* split records into memory as interval trees. + let mut gr_by_features: HashMap> = HashMap::new(); + for (feature, ranges) in records_by_features.into_iter() { + // doing a streaming merge of each feature; bookend merge + let merging_iter = MergingEmptyIterator::new(ranges, 0); + + // load into memory and convert to interval trees + let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + + assert!(!gr_by_features.contains_key(&feature)); + gr_by_features.insert(feature, gr); + } + + // Now, create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + let features: Vec<_> = gr_by_features.keys().cloned().collect(); + let mut feature_matrix = Vec::new(); + for (_feature, gr) in gr_by_features.into_iter() { + // find the features that overlap each window + // TODO/OPTIMIZE: how costly is this clone? + let window_counts = windows + .clone() + .left_overlaps(&gr)? + .map_joins(|joins| { + // these are merged, so this is the number of *unique* basepairs + let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); + total_overlaps + })? + .take_data()?; + feature_matrix.push(window_counts); + } - // Load all the *merged* split records into memory as interval trees. 
- let mut gr_by_features: HashMap> = HashMap::new(); - for (feature, ranges) in records_by_features.into_iter() { - // doing a streaming merge of each feature; bookend merge - let merging_iter = MergingEmptyIterator::new(ranges, 0); + let feature_matrix = transpose(feature_matrix); - // load into memory and convert to interval trees - let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + // Unite the windows with the feature matrix. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + let windows = windows.into_granges_data(feature_matrix)?; + Ok((windows, features)) + } - assert!(!gr_by_features.contains_key(&feature)); - gr_by_features.insert(feature, gr); + /// Calculate feature density, but assign each basepair exclusively to a single + /// "feature set", which is the unique set of features that a particular + /// basepair overlaps. + pub fn feature_density_exclusive(&self) -> Result<(GRangesFeatureMatrix, Vec), GRangesError> { + let bedfile = &self.bedfile; + let genome = read_seqlens(&self.genome)?; + let bed4_iter = Bed4Iterator::new(bedfile)?; + + // Create GRanges of the feature indices. + // We do this manually to avoid creating a needless data container. + let mut gr = GRanges::new_vec_keyed(&genome); + for result in bed4_iter { + let range = result?; + + // Note the trick here: we are *re-using* indices. + gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; + } + let gr = gr.into_coitrees()?; + + // clone the feature map + let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); + + // Create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + // "Reduce" ranges and + let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { + // "reduce" the ranges to a minimum spanning set that contains a vec of indices + let ranges = join.join.reduce_ranges(); + let mut feature_overlaps: HashMap, _> = HashMap::new(); + for range in ranges.iter() { + let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); + key.sort(); // sorted key is compound key + *feature_overlaps.entry(key).or_insert(0) += range.width(); } - // Now, create windows. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - let features: Vec<_> = gr_by_features.keys().cloned().collect(); - let mut feature_matrix = Vec::new(); - for (_feature, gr) in gr_by_features.into_iter() { - // find the features that overlap each window - // TODO/OPTIMIZE: how costly is this clone? - let window_counts = windows - .clone() - .left_overlaps(&gr)? - .map_joins(|joins| { - // these are merged, so this is the number of *unique* basepairs - let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); - total_overlaps - })? - .take_data()?; - feature_matrix.push(window_counts); + feature_overlaps + })?; + + // Now, we go over the data and get the observed sets + let mut observed_sets = UniqueIdentifier::new(); + let data = windows_overlaps + .data() + .ok_or(GRangesError::NoDataContainer)?; + for feature_counts in data.iter() { + for feature_set in feature_counts.keys() { + observed_sets.get_or_insert(feature_set); + } + } + let nsets = observed_sets.len(); // total feature sets + + // Go over data again, making the sparse hashmap into a full + // matrix of overlappng basepairs. 
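+        // Column order here is the iteration order of `observed_sets`; the
+        // feature-set labels created below are built in the same order, before
+        // both are re-sorted by column totals.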
+ let window_counts = windows_overlaps.map_data(|feature_counts| { + // fill out matrix + let mut counts = vec![0; nsets]; + for (i, feature_set) in observed_sets.keys().enumerate() { + counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); } + counts + })?; + + // To make this prettier, let's sort by column totals (this + // is a little pricey; we can make option later if needed). + let matrix = window_counts.data().ok_or(GRangesError::NoDataContainer)?; + let col_totals = column_totals(matrix); + let sorted_indices = sorted_indices_by_values(&col_totals); + + let window_counts = window_counts.map_data(|row| { + let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); + row + })?; + + // Create the labels. + let feature_sets: Vec = observed_sets + .keys() + .map(|indices_key| { + let labels: Vec<_> = indices_key + .iter() + .map(|idx| feature_map.get_key(*idx).unwrap().clone()) + .collect(); + labels.join(",") + }) + .collect(); - let feature_matrix = transpose(feature_matrix); + // Re-order the headers too by the sorted indices. + let feature_sets: Vec<_> = sorted_indices + .iter() + .map(|i| feature_sets[*i].clone()) + .collect(); - // Unite the windows with the feature matrix. - let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - let windows = windows.into_granges_data(feature_matrix)?; + Ok((window_counts, feature_sets)) + } + + /// Run this command given the command line interface. + pub fn run(&self) -> Result, GRangesError> { + if !self.exclusive { + let (window_counts, features) = self.feature_density()?; // Write everything. if !self.headers { // we have to *manually* add headers (serde needs flat structs otherwise) - windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + window_counts.write_to_tsv(self.output.as_ref(), &BED_TSV)?; } else { let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; headers.extend(features); @@ -841,92 +956,10 @@ impl HistFeatures { no_value_string: "NA".to_string(), headers: Some(headers), }; - windows.write_to_tsv(self.output.as_ref(), &config)?; + window_counts.write_to_tsv(self.output.as_ref(), &config)?; } } else { - // This is done a bit differently than the non-classify case - - // Create GRanges of the feature indices. - // We do this manually to avoid creating a needless data container. - let mut gr = GRanges::new_vec_keyed(&genome); - for result in bed4_iter { - let range = result?; - - // Note the trick here: we are *re-using* indices. - gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?; - } - let gr = gr.into_coitrees()?; - - // clone the feature map - let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone(); - - // Create windows. 
- let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; - - // "Reduce" ranges and - let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| { - // "reduce" the ranges to a minimum spanning set that contains a vec of indices - let ranges = join.join.reduce_ranges(); - let mut feature_overlaps: HashMap, _> = HashMap::new(); - for range in ranges.iter() { - let mut key: Vec = range.indices().iter().map(|x| x.unwrap()).collect(); - key.sort(); // sorted key is compound key - *feature_overlaps.entry(key).or_insert(0) += range.width(); - } - - feature_overlaps - })?; - - // Now, we go over the data and get the observed sets - let mut observed_sets = UniqueIdentifier::new(); - let data = windows_overlaps - .data() - .ok_or(GRangesError::NoDataContainer)?; - for feature_counts in data.iter() { - for feature_set in feature_counts.keys() { - observed_sets.get_or_insert(feature_set); - } - } - let nsets = observed_sets.len(); // total feature sets - - // Go over data again, making the sparse hashmap into a full - // matrix of overlappng basepairs. - let gr = windows_overlaps.map_data(|feature_counts| { - // fill out matrix - let mut counts = vec![0; nsets]; - for (i, feature_set) in observed_sets.keys().enumerate() { - counts[i] = *feature_counts.get(feature_set).unwrap_or(&0); - } - counts - })?; - - // To make this prettier, let's sort by column totals (this - // is a little pricey; we can make option later if needed). - let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?; - let col_totals = column_totals(matrix); - let sorted_indices = sorted_indices_by_values(&col_totals); - - let gr = gr.map_data(|row| { - let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect(); - row - })?; - - // Create the labels. - let feature_sets: Vec = observed_sets - .keys() - .map(|indices_key| { - let labels: Vec<_> = indices_key - .iter() - .map(|idx| feature_map.get_key(*idx).unwrap().clone()) - .collect(); - labels.join(",") - }) - .collect(); - - // Re-order the headers too by the sorted indices. 
- let feature_sets: Vec<_> = sorted_indices.iter() - .map(|i| feature_sets[*i].clone()).collect(); - + let (window_counts, feature_sets) = self.feature_density_exclusive()?; let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; headers.extend(feature_sets); @@ -934,14 +967,13 @@ impl HistFeatures { no_value_string: "NA".to_string(), headers: Some(headers), }; - gr.write_to_tsv(self.output.as_ref(), &config)?; + window_counts.write_to_tsv(self.output.as_ref(), &config)?; } Ok(CommandOutput::new((), None)) } } - -// get column totals +// get column totals fn column_totals(matrix: &Vec>) -> Vec { if matrix.is_empty() || matrix[0].is_empty() { return Vec::new(); @@ -959,7 +991,7 @@ fn column_totals(matrix: &Vec>) -> Vec { } totals -} +} // get descending indices order fn sorted_indices_by_values(values: &[Position]) -> Vec { @@ -967,4 +999,3 @@ fn sorted_indices_by_values(values: &[Position]) -> Vec { indices.sort_by_key(|&i| std::cmp::Reverse(values[i])); indices } - diff --git a/src/main/mod.rs b/src/main/mod.rs index 70c6437..f63f0b8 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -4,7 +4,7 @@ use clap::{Parser, Subcommand}; use granges::{ commands::{ granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, FilterChroms, - HistFeatures, Merge, ProcessingMode, + FeatureDensity, Merge, ProcessingMode, }, data::operations::FloatOperation, prelude::GRangesError, @@ -26,8 +26,12 @@ Subcommands: overlap with a right range. This is equivalent to a filtering "semi-join" in SQL terminology. - hist-features: Histogram feature coverage per windows. This merges overlapping - ranges first. + feature-density Calculate the density of features per window, e.g. how many + basepairs are "exon", "CDS", etc. With --exclusive, this will assign + each basepair exclusively to a unique "feature set", such that + if a basepair overlaps "CDS" and "exon" ranges, the overlapping + number of basepairs will be added to a new composite "CDS,exon" + feature set. map: Compute the left grouped overlaps between the left genomic ranges and right genomic ranges, and apply one or more operations to the @@ -146,7 +150,7 @@ enum Commands { #[arg(long)] in_mem: bool, }, - HistFeatures(HistFeatures), + FeatureDensity(FeatureDensity), /// Do a "left grouped join", on the specified left and right genomic ranges, /// and apply one or more functions to the BED5 scores for all right genomic /// ranges. @@ -313,7 +317,7 @@ fn run() -> Result<(), GRangesError> { ) } // NOTE: this is the new API, so clean! - Some(Commands::HistFeatures(hist_features)) => hist_features.run(), + Some(Commands::FeatureDensity(density)) => density.run(), Some(Commands::Merge(merge)) => merge.run(), Some(Commands::Windows { genome, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 38dcbec..0f9f70c 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,7 +1,13 @@ //! Test cases and test utility functions. //! -use std::{io::BufRead, path::PathBuf}; +use std::{ + env, + fs::{self, copy}, + io::BufRead, + path::PathBuf, +}; +use tempfile::{Builder, NamedTempFile}; use crate::{ commands::granges_random_bed, @@ -22,12 +28,16 @@ use genomap::GenomeMap; use indexmap::IndexMap; #[cfg(feature = "ndarray")] use ndarray::{Array1, Array2}; -use rand::{rngs::StdRng, distributions::Uniform, seq::SliceRandom, Rng, SeedableRng}; - -// Set the seed for reproducible tests. 
-const SEED: u64 = 4; - -use tempfile::{Builder, NamedTempFile}; +use rand::{distributions::Uniform, rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + +// Use a default random seed, unless one is in the environment. +fn get_rng() -> StdRng { + let seed: u64 = env::var("TEST_SEED") + .unwrap_or_else(|_| "13".to_string()) + .parse() + .expect("TEST_SEED must be a valid u64"); + StdRng::seed_from_u64(seed) +} /// Get the path to the `grange` command line tool after a build. /// This is used for integration tests and benchmarks. @@ -62,31 +72,29 @@ pub const MAX_CHROM_LEN: Position = 250_000_000; /// Build a random range start/end on a sequence of `max_len`. /// 0-indexed, right exclusive -pub fn random_range(chrom_len: Position) -> (Position, Position) { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_range(chrom_len: Position, rng: &mut StdRng) -> (Position, Position) { let len = rng.gen_range(MIN_LEN..MAX_LEN); let start = rng.gen_range(0..chrom_len - len + 1); (start, start + len) } /// Build random sequence lengths -pub fn random_seqlen() -> Position { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_seqlen(rng: &mut StdRng) -> Position { rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome -pub fn random_chrom() -> String { - let mut rng = StdRng::seed_from_u64(SEED); +pub fn random_chrom(rng: &mut StdRng) -> String { format!("chr{}", rng.gen_range(1..NCHROM + 1)) } /// Build random [`VecRanges`] pub fn random_vecranges(n: usize) -> VecRanges { - let seqlen = random_seqlen(); + let mut rng = get_rng(); + let seqlen = random_seqlen(&mut rng); let mut vr = VecRanges::new(seqlen); for _i in 0..n { - let (start, end) = random_range(seqlen); + let (start, end) = random_range(seqlen, &mut rng); let range = RangeEmpty::new(start, end); vr.push_range(range); } @@ -99,7 +107,7 @@ pub fn random_granges( seqlens: &IndexMap, num: usize, ) -> Result, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRangesEmpty::new_vec(seqlens); @@ -109,15 +117,14 @@ pub fn random_granges( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); gr.push_range(seqname, start, end)?; } Ok(gr) } /// Generate random strings, e.g. for mock feature names. -fn generate_random_string(n: usize) -> String { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_string(n: usize, rng: &mut StdRng) -> String { let letters: Vec = ('a'..='z').collect(); let letters_dist = Uniform::from(0..letters.len()); @@ -125,8 +132,7 @@ fn generate_random_string(n: usize) -> String { } /// Generate a random float value, e.g. for a mock BED "score". 
-fn generate_random_uniform(start: f64, end: f64) -> f64 { - let mut rng = StdRng::seed_from_u64(SEED); +fn generate_random_uniform(start: f64, end: f64, rng: &mut StdRng) -> f64 { let uniform = Uniform::new(start, end); // Specify the range rng.sample(uniform) } @@ -137,7 +143,7 @@ pub fn random_granges_mock_bed5( seqlens: &IndexMap, num: usize, ) -> Result>, GRangesError> { - let mut rng = StdRng::seed_from_u64(SEED); + let mut rng = get_rng(); let mut gr = GRanges::new_vec(seqlens); @@ -147,10 +153,10 @@ pub fn random_granges_mock_bed5( let chrom_len = *seqlens .get(seqname) .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; - let (start, end) = random_range(chrom_len); + let (start, end) = random_range(chrom_len, &mut rng); let bed5_cols = Bed5Addition { - name: generate_random_string(8), - score: Some(generate_random_uniform(0.0, 1.0)), + name: generate_random_string(8, &mut rng), + score: Some(generate_random_uniform(0.0, 1.0, &mut rng)), }; gr.push_range(seqname, start, end, bed5_cols)?; } @@ -166,10 +172,31 @@ pub fn random_coitrees() -> COITrees<()> { /// Get a temporary file with a .bed suffix. pub fn temp_bedfile() -> NamedTempFile { - Builder::new() + let tempfile = Builder::new() .suffix(".bed") .tempfile() - .expect("Failed to create temp file") + .expect("Failed to create temp file"); + tempfile +} + +/// Copy a temp file for inspection, allowing the specification of a custom temporary directory name. +// AICODE -- co-written with AI +pub fn copy_tempfile_for_inspection(tempfile: impl Into, new_file_name: &str) -> PathBuf { + let tempfile = tempfile.into(); + // Check if the TEST_LOCAL environment variable is set + if env::var("TEST_LOCAL").is_ok() { + let dir_path = PathBuf::from("test_temp"); + fs::create_dir_all(&dir_path).expect("could not create test_temp/"); + + let dest_path = dir_path.join(new_file_name); + + copy(tempfile, &dest_path).expect("could not copy temp file"); + + dest_path + } else { + // If TEST_LOCAL is not set, simply return the path of the original temp file + tempfile + } } /// Create a random BED3 file based on the hg38 sequence lengths, and write to disk. diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 09dc639..67b0f0a 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -93,7 +93,7 @@ fn validate_bedfloats( // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. 
#[cfg(not(feature = "bench-big"))] -const BED_LENGTH: usize = 10; +const BED_LENGTH: usize = 100_000; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; @@ -322,6 +322,10 @@ fn test_against_bedtools_makewindows() { let widths = vec![131123, 1_000_0013]; let steps = vec![10_001, 10_113]; + // smaller test set: + // let widths = vec![1_000_0013]; + // let steps = vec![10000]; + for width in widths.iter() { for step in steps.iter() { let bedtools_output = Command::new("bedtools") @@ -441,8 +445,8 @@ fn test_against_bedtools_map_multiple() { // try multiple operations at once let num_ranges = BED_LENGTH; let width = 1_000_000; - #[allow(unused_variables)] - let step = 10_000; // can uncomment lines below to test this + // #[allow(unused_variables)] + // let step = 10_000; // can uncomment lines below to test this // make windows let windows_file = temp_bedfile(); @@ -464,8 +468,11 @@ fn test_against_bedtools_map_multiple() { granges_windows_output ); + // copy_tempfile_for_inspection(&windows_file.path(), "windows.bed"); + // create the random data BED5 let bedscores_file = random_bed5file(num_ranges); + // copy_tempfile_for_inspection(&bedscores_file.path(), "scores.bed"); let bedtools_path = temp_bedfile(); let bedtools_output_file = File::create(&bedtools_path).unwrap(); @@ -485,6 +492,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("bedtools map failed"); + // copy_tempfile_for_inspection(&bedtools_path.path(), "bedtools.bed"); + let granges_output_file = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("map") @@ -501,6 +510,8 @@ fn test_against_bedtools_map_multiple() { .output() .expect("granges map failed"); + // copy_tempfile_for_inspection(&granges_output_file.path(), "granges.bed"); + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); @@ -539,6 +550,7 @@ fn test_against_bedtools_map_multiple() { .iter() .zip(bedtools_data.iter()) .for_each(|(gr, bd)| { + // dbg!((gr.min, bd.min)); assert_option_float_tol!(gr.min, bd.min); assert_option_float_tol!(gr.max, bd.max);
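
One possible workflow with the pieces added in this patch (the seed below is an
arbitrary example): let scripts/seed_grinder.sh hunt for a failing seed, then
replay that seed directly. With the copy_tempfile_for_inspection() calls above
uncommented, TEST_LOCAL=1 also copies the generated BED files into test_temp/
for side-by-side inspection:

    bash scripts/seed_grinder.sh
    # suppose it reports: "Tests failed with TEST_SEED=4217"
    TEST_SEED=4217 TEST_LOCAL=1 cargo test --test bedtools_validation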