From ba880b691620126aa1178ad7d77e5a3aa516f8b3 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sat, 2 Mar 2024 23:08:42 -0800 Subject: [PATCH] New merging iterators, cleaned up BED parsing, new `merge` command. - Merging iterator prototypes added. These are streaming iterators of sorted input that are merged based on distance or overlap. - Much cleaner BED variant detection, and `Bed4Iterator` method. - Entire organization of parsing modules. - New detection methods that try to use serde deserialization. - The new `merge` command is structured differently than other commands as part of a new cleaner way to build up subcommands in code. - New validation and benchmarks of `granges merge`. - New `granges_test_case_03()` and new test data. - New test utilities, e.g. `validate_bedfloats()` for validating agains bedtools map, etc functions. - Bedlike now based on csv + serde. - New `granges filter-chrom` command. --- Cargo.toml | 4 +- benches/bedtools_comparison.rs | 43 +++ src/commands.rs | 194 ++++++++++-- src/data/operations.rs | 24 +- src/error.rs | 3 + src/granges.rs | 56 +++- src/io/mod.rs | 3 +- src/io/parsers/bed.rs | 284 ----------------- src/io/parsers/bed/bed3.rs | 26 ++ src/io/parsers/bed/bed4.rs | 46 +++ src/io/parsers/bed/bed5.rs | 52 +++ src/io/parsers/bed/bedlike.rs | 161 ++++++++++ src/io/parsers/bed/mod.rs | 99 ++++++ src/io/parsers/detect.rs | 181 +++++++++++ src/io/parsers/filters.rs | 268 ++++++++++++++++ src/io/parsers/mod.rs | 564 +-------------------------------- src/io/parsers/tsv.rs | 52 ++- src/io/parsers/utils.rs | 85 +++++ src/iterators.rs | 106 ++++++- src/lib.rs | 16 +- src/main/mod.rs | 16 +- src/merging_iterators.rs | 475 +++++++++++++++++++++++++++ src/ranges/mod.rs | 42 +++ src/reporting.rs | 4 +- src/sequences/nucleotide.rs | 1 - src/test_utilities.rs | 77 ++++- src/traits.rs | 68 +++- tests/bedtools_validation.rs | 253 +++++++++++---- tests_data/example_bedlike.tsv | 10 +- tests_data/test_case_01.bed | 5 + 
tests_data/test_case_03.bed | 5 + 31 files changed, 2230 insertions(+), 993 deletions(-) delete mode 100644 src/io/parsers/bed.rs create mode 100644 src/io/parsers/bed/bed3.rs create mode 100644 src/io/parsers/bed/bed4.rs create mode 100644 src/io/parsers/bed/bed5.rs create mode 100644 src/io/parsers/bed/bedlike.rs create mode 100644 src/io/parsers/bed/mod.rs create mode 100644 src/io/parsers/detect.rs create mode 100644 src/io/parsers/filters.rs create mode 100644 src/merging_iterators.rs create mode 100644 tests_data/test_case_01.bed create mode 100644 tests_data/test_case_03.bed diff --git a/Cargo.toml b/Cargo.toml index 1f14e55..95bb0e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "granges" -version = "0.2.0" +version = "0.2.1" edition = "2021" license = "MIT" authors = ["Vince Buffalo "] @@ -14,7 +14,7 @@ description = "A Rust library and command line tool for genomic range operations # clap = { version = "4.4.18", features = ["derive"], optional = true } clap = { version = "4.4.18", features = ["derive"] } coitrees = { version = "0.4.0", features = ["nosimd"] } -flate2 = "1.0.28" +flate2 = { version = "1.0.28", features = ["zlib-ng-compat"] } genomap = "0.2.6" indexmap = "2.2.3" ndarray = { version = "0.15.6", optional = true} diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 723a3d0..b6eee58 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -357,8 +357,51 @@ fn bench_map_all_operations(c: &mut Criterion) { }); } +fn bench_merge_empty(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("merge_empty"); + + // create the test data + let input_bedfile = random_bed3file(BED_LENGTH); + + // the distance + let distance = 100; + + // configure the sample size for the group + group.sample_size(10); + + group.bench_function("bedtools_merge", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") 
+ .arg(input_bedfile.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_merge", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(input_bedfile.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + assert!(granges_output.status.success()); + }); + }); +} + criterion_group!( benches, + bench_merge_empty, bench_filter_adjustment, bench_range_adjustment, bench_flank, diff --git a/src/commands.rs b/src/commands.rs index 899435b..2e5a036 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,15 +1,18 @@ //! Command functions that implement each of the `granges` subcommands. +//! +// TODO: these functions should be methods of the input struct. -use std::{fs::File, io, path::PathBuf}; - +use clap::Parser; use csv::WriterBuilder; +use std::{fs::File, io, path::PathBuf}; use crate::{ - data::{operations::Operation, SerializableDatumType}, + data::{operations::FloatOperation, SerializableDatumType}, io::{ parsers::{Bed5Iterator, GenomicRangesParser}, tsv::BED_TSV, }, + merging_iterators::{MergingEmptyResultIterator, MergingResultIterator}, prelude::*, ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty}, reporting::{CommandOutput, Report}, @@ -111,6 +114,11 @@ pub fn granges_adjust( gr.adjust_ranges(-both, both) .write_to_tsv(output, &BED_TSV)? } + GenomicRangesParser::Bed4(iter) => { + let gr = GRanges::from_iter(iter, &genome)?; + gr.adjust_ranges(-both, both) + .write_to_tsv(output, &BED_TSV)? 
+ } GenomicRangesParser::Bed5(iter) => { let gr = GRanges::from_iter(iter, &genome)?; gr.adjust_ranges(-both, both) @@ -130,7 +138,7 @@ pub fn granges_adjust( } } } - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), Some(report))) } /// Filters genomic ranges based on overlaps with another set of ranges. @@ -168,9 +176,6 @@ pub fn granges_filter( let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; - // for reporting stuff to the user - let report = Report::new(); - match (left_iter, right_iter) { (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => { let left_gr; @@ -189,7 +194,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { let left_gr; @@ -211,7 +216,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { let left_gr; @@ -231,7 +236,7 @@ pub fn granges_filter( let semijoin = left_gr.filter_overlaps(&right_gr)?; semijoin.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { let left_gr; @@ -254,7 +259,7 @@ pub fn granges_filter( let intersection = left_gr.filter_overlaps(&right_gr)?; intersection.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } _ => Err(GRangesError::UnsupportedGenomicRangesFileFormat), } @@ -297,8 +302,6 @@ pub fn granges_flank( let seqnames: Vec = genome.keys().cloned().collect(); let ranges_iter = 
GenomicRangesFile::parsing_iterator(bedfile)?; - let report = Report::new(); - match mode { // Note: this is kept for benchmarking, to see how costly building GRanges // objects is versus using streaming. @@ -312,6 +315,16 @@ pub fn granges_flank( gr.flanking_ranges(left, right)? .write_to_tsv(output, &BED_TSV)? } + GenomicRangesParser::Bed4(iter) => { + let gr = if skip_missing { + GRanges::from_iter(iter.retain_seqnames(&seqnames), &genome)? + } else { + GRanges::from_iter(iter, &genome)? + }; + gr.flanking_ranges(left, right)? + .write_to_tsv(output, &BED_TSV)? + } + GenomicRangesParser::Bed5(_iter) => { unimplemented!() } @@ -372,6 +385,9 @@ pub fn granges_flank( } } } + GenomicRangesParser::Bed4(_iter) => { + unimplemented!() + } GenomicRangesParser::Bed5(_iter) => { unimplemented!() } @@ -412,7 +428,7 @@ pub fn granges_flank( } } } - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// # Developer Notes @@ -421,13 +437,12 @@ pub fn granges_map( seqlens: impl Into, left_path: &PathBuf, right_path: &PathBuf, - operations: Vec, + operations: Vec, output: Option<&PathBuf>, skip_missing: bool, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); - let report = Report::new(); let left_iter = Bed3Iterator::new(left_path)?; let right_iter = Bed5Iterator::new(right_path)?; @@ -487,7 +502,7 @@ pub fn granges_map( result_gr.write_to_tsv(output, &BED_TSV)?; - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// Generate a BED3 file of genomic windows. @@ -500,8 +515,7 @@ pub fn granges_windows( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?; - let report = Report::new(); - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) } /// Generate a random BED-like file with genomic ranges. 
@@ -529,6 +543,144 @@ pub fn granges_random_bed( gr.write_to_tsv(output, &BED_TSV)?; }; - let report = Report::new(); - Ok(CommandOutput::new((), report)) + Ok(CommandOutput::new((), None)) +} + +/// Merges all the genomic ranges if they overlap by `distance`. +#[derive(Parser)] +pub struct Merge { + /// The input BED-like TSV file to merge. + #[arg(short, long, required = true)] + bedfile: PathBuf, + + /// The minimum distance at which to merge ranges. Like `bedtools merge`, + /// `--distance 0` will merge "book-ended" ranges. Negative numbers + /// will only merge ranges that overlap by that degree of overlap. + #[clap(short, long, default_value_t = 0)] + distance: PositionOffset, + + ///// Whether to "group by" feature name, i.e. overlapping ranges + ///// with different feature names will not be merged. + //#[clap(short, long)] + //group_features: usize, + /// Operation to do to summarize the score column. + #[clap(short, long, value_parser = clap::value_parser!(FloatOperation))] + func: Option, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, +} + +impl Merge { + // TODO optional genome file for validation? 
+ pub fn run(&self) -> Result, GRangesError> { + let bedfile = &self.bedfile; + let distance = &self.distance; + let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; + let func = &self.func; + + let writer_boxed: Box = match &self.output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + match ranges_iter { + GenomicRangesParser::Bed3(iter) => { + let merging_iter = MergingEmptyResultIterator::new(iter, *distance); + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bed4(iter) => { + let merging_iter = MergingResultIterator::new(iter, *distance, |data| { + data.into_iter() + .map(|x| x.name) + .collect::>() + .join(",") + }); + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bed5(iter) => { + // merging iterator, where we extract scores and apply an operation to all merged genomic ranges' scores + let merging_iter = MergingResultIterator::new(iter, *distance, |data| { + let mut scores: Vec = data + .into_iter() + .filter_map(|bed5_cols| bed5_cols.score) + .collect(); + // this unwrap is safe -- if func is None, we use Bed3 + func.as_ref().unwrap().run(&mut scores) + }); + + for result in merging_iter { + let record = result?; + writer.serialize(record)?; + } + Ok(CommandOutput::new((), None)) + } + GenomicRangesParser::Bedlike(_iter) => { + todo!() + } + GenomicRangesParser::Unsupported => { + Err(GRangesError::UnsupportedGenomicRangesFileFormat) + } + } + } +} + +/// Filter out ranges not in the specified "genome" file. 
+#[derive(Parser)] +pub struct FilterChroms { + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, + + /// The input BED file. + #[arg(short, long, required = true)] + bedfile: PathBuf, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, +} + +impl FilterChroms { + pub fn run(&self) -> Result, GRangesError> { + let bedfile = &self.bedfile; + let genome = read_seqlens(&self.genome)?; + + let writer_boxed: Box = match &self.output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + let mut writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + // If we don't need to sort, use iterator-based streaming processing. + for record in bedlike_iterator { + let range = record?; + let seqname = &range.seqname; + let passes_filter = genome.contains_key(seqname); + if passes_filter { + writer.serialize(range)?; + } + } + + Ok(CommandOutput::new((), None)) + } } diff --git a/src/data/operations.rs b/src/data/operations.rs index 05245a2..d2feffd 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -33,7 +33,7 @@ pub fn median(numbers: &mut [F]) -> Option { /// The (subset of) standard `bedtools map` operations. #[derive(Clone, Debug, ValueEnum)] -pub enum Operation { +pub enum FloatOperation { /// Calculate the sum of all values (a set of zero elements has sum 0.0). Sum, /// Calculate the sum of all values, but set of zero elements is a missing value, not 0.0. 
@@ -50,25 +50,25 @@ pub enum Operation { Collapse, } -impl Operation { +impl FloatOperation { #[inline(always)] pub fn run(&self, data: &mut [T]) -> DatumType where T: Float + Sum + ToPrimitive + Clone + ToString, { match self { - Operation::Sum => { + FloatOperation::Sum => { let sum: T = data.iter().copied().sum(); sum.into_data_type() } - Operation::SumNotEmpty => { + FloatOperation::SumNotEmpty => { if data.is_empty() { return DatumType::NoValue; } let sum: T = data.iter().copied().sum(); sum.into_data_type() } - Operation::Min => { + FloatOperation::Min => { let min = data .iter() .filter(|x| x.is_finite()) @@ -76,7 +76,7 @@ impl Operation { .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Greater)); min.map_or(DatumType::NoValue, |x| x.into_data_type()) } - Operation::Max => { + FloatOperation::Max => { let max = data .iter() .filter(|x| x.is_finite()) @@ -84,7 +84,7 @@ impl Operation { .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Less)); max.map_or(DatumType::NoValue, |x| x.into_data_type()) } - Operation::Mean => { + FloatOperation::Mean => { if data.is_empty() { DatumType::NoValue } else { @@ -93,8 +93,10 @@ impl Operation { mean.into_data_type() } } - Operation::Median => median(data).map_or(DatumType::NoValue, |x| x.into_data_type()), - Operation::Collapse => { + FloatOperation::Median => { + median(data).map_or(DatumType::NoValue, |x| x.into_data_type()) + } + FloatOperation::Collapse => { let collapsed = data .iter() .map(|num| num.to_string()) @@ -106,6 +108,10 @@ impl Operation { } } +pub enum StringOperation { + Collapse, +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/error.rs b/src/error.rs index 3204519..7211d2e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -75,6 +75,9 @@ pub enum GRangesError { #[error("File reading error: {0}. 
Please check if the file exists and you have permission to read it.")] IOError(#[from] std::io::Error), + #[error("The specified file '{0}' is empty.")] + EmptyFile(String), + #[error("File parsing error: {0}")] TsvParsingError(#[from] csv::Error), diff --git a/src/granges.rs b/src/granges.rs index 62d4048..4dcd600 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -47,7 +47,7 @@ use serde::Serialize; use crate::{ ensure_eq, io::tsv::TsvConfig, - iterators::GRangesIterator, + iterators::{GRangesIterator, GRangesRecordIterator}, join::{ CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, CombinedJoinDataRightEmpty, JoinData, JoinDataBothEmpty, JoinDataLeftEmpty, @@ -1315,6 +1315,39 @@ where Ok(gr) } + ///// Retain the *intersection* of each left genomic ranges that has at least one + ///// overlap with the `right` set of genomic ranges. + ///// + ///// This will retain only the overlapping portion + ///// + ///// This is a type of *filtering join*, in particular a *semi-join*. + ///// See Hadley Wickham's [R for Data + ///// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + //pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( + // self, + // // right: &GRanges, DR>, + // right: &'a impl AsGRangesRef<'a, COITrees, DR>, + //) -> Result, GRangesError> { + // let mut gr = GRangesEmpty::new_vec(&self.seqlens()); + + // let right_ref = right.as_granges_ref(); + + // for (seqname, left_ranges) in self.0.ranges.iter() { + // for left_range in left_ranges.iter_ranges() { + // if let Some(right_ranges) = right_ref.ranges.get(seqname) { + // let num_overlaps = + // right_ranges.count_overlaps(left_range.start(), left_range.end()); + // if num_overlaps == 0 { + // // no overlaps -- skip + // } else { + // gr.push_range(seqname, left_range.start(), left_range.end())?; + // } + // } + // } + // } + // Ok(gr) + //} + /// Retain only genomic ranges that have at least one overlap with the `right` /// set of genomic ranges. 
The whole range will be retained. /// @@ -1471,6 +1504,17 @@ where } } +impl GRanges +where + R: IterableRangeContainer, + T: IndexedDataContainer, +{ + /// Create a new [`GRangesRecordIterator`] to iterate through all the ranges in this [`GRanges`] object. + pub fn iter_records(&self) -> GRangesRecordIterator<'_, R, T> { + GRangesRecordIterator::new(self) + } +} + /// [`PartialEq`] for [`GRanges`] objects. /// /// This is a more powerful comparison operator than [`GRanges.is_equal_to()`], since it will first @@ -1512,6 +1556,7 @@ where #[cfg(test)] mod tests { use crate::{ + iterators::GRangesRecordIterator, prelude::*, test_utilities::{granges_test_case_01, granges_test_case_02, random_vecranges}, Position, @@ -1901,4 +1946,13 @@ mod tests { assert_eq!(first_chrom.0, "chr1"); assert_eq!(*first_chrom.1, vec![2, 5, 13]); } + + #[test] + fn test_grange_iter_records() { + // main test is in iterators.rs module. + let gr = granges_test_case_01(); + let grr_iter = GRangesRecordIterator::new(&gr); + let iter = gr.iter_records(); + assert!(grr_iter.zip(iter).all(|(a, b)| a == b)); + } } diff --git a/src/io/mod.rs b/src/io/mod.rs index d804f08..d33c373 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,7 +6,8 @@ pub mod tsv; pub use file::{InputStream, OutputStream}; pub use parsers::{ - bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator, + bed::{Bed3Iterator, Bed4Iterator, Bed5Iterator, BedlikeIterator}, + tsv::TsvRecordIterator, GenomicRangesFile, GenomicRangesParser, }; pub use tsv::{TsvConfig, BED_TSV}; diff --git a/src/io/parsers/bed.rs b/src/io/parsers/bed.rs deleted file mode 100644 index d1811e3..0000000 --- a/src/io/parsers/bed.rs +++ /dev/null @@ -1,284 +0,0 @@ -//! BED Types and Functionality -//! -//! The BED (Browser Extensible Format) is a TSV format in bioinformatics. -//! It has a fairly strict [specification](https://samtools.github.io/hts-specs/BEDv1.pdf), -//! 
but in practice it is quite permissive, and in bioinformatics one encounters lots -//! of "BED-like" files. -//! -//! # Design -//! -//! Since BED files can be thought of BED3 + an optional *addition*, the type -//! returned by these parsing iterators is [`GenomicRangeRecord`] where -//! `U` is some sort of addition type like [`Bed5Addition`]. -//! -//! # ⚠️ Stability -//! -//! This module defines core BED types, but is under active development. -//! - -use serde::de::Error as DeError; -use serde::{Deserialize, Deserializer, Serialize}; -use std::io::{BufRead, BufReader}; -use std::path::PathBuf; -use std::str::FromStr; - -use crate::error::GRangesError; -use crate::io::InputStream; -use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; -use crate::Position; - -use super::parse_column; -use super::tsv::TsvRecordIterator; - -// for BedlikeIterator only -pub const PARSE_CAPACITY: usize = 512; - -/// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED -/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to -/// be used for missing strands, but in practice it can be found to represent -/// missing scores, etc too. -pub fn bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, - T: Deserialize<'de> + FromStr, - ::Err: std::fmt::Display, -{ - let missing_chars = &["."]; - deserialize_option_generic(deserializer, missing_chars) // Use the generic deserializer with specific placeholders -} - -/// The additional two BED5 columns. -/// -/// # Fields -/// * `name`: the feature name. -/// * `score`: a score. -// TODO/RENAME: maybe since this goes against spec it should -// be called Bed5AdditionPermissive? 
-#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] -#[serde(deny_unknown_fields)] -pub struct Bed5Addition { - pub name: String, - #[serde(deserialize_with = "bed_missing")] - pub score: Option, -} - -/// An iterator over BED5 entries, which contain the three -/// range entries (sequence name, start and end positions), -/// a feature name, and a score. -/// -/// Note that the [`Bed5Addition`] is *permissive* in the sense -/// it allows for more input types than the official [BED format -/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). -// TODO strict type? -#[derive(Debug)] -pub struct Bed5Iterator { - iter: TsvRecordIterator>, -} - -impl Bed5Iterator { - /// Creates a parsing iterator over a BED5 file. - pub fn new(filepath: impl Into) -> Result { - let iter = TsvRecordIterator::new(filepath)?; - - Ok(Self { iter }) - } -} - -impl Iterator for Bed5Iterator { - type Item = Result, GRangesError>; - - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// An iterator over BED3 entries (which just contain ranges no data). -#[derive(Debug)] -pub struct Bed3Iterator { - iter: TsvRecordIterator, -} - -impl Bed3Iterator { - /// Creates a parsing iterator over a BED5 file. - pub fn new(filepath: impl Into) -> Result { - let iter = TsvRecordIterator::new(filepath)?; - Ok(Self { iter }) - } -} - -impl Iterator for Bed3Iterator { - type Item = Result; - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// Nucleotide strand enum type. -#[derive(Clone, Debug)] -pub enum Strand { - Forward, - Reverse, -} - -/// Deserializes some value of type `t` with some possible missing -/// character `missing_chars` into [`Option`]. 
-pub fn deserialize_option_generic<'de, D, T>( - deserializer: D, - missing_chars: &'de [&'de str], -) -> Result, D::Error> -where - D: Deserializer<'de>, - T: Deserialize<'de> + FromStr, - ::Err: std::fmt::Display, -{ - let s: String = Deserialize::deserialize(deserializer)?; - if missing_chars.contains(&s.as_str()) { - Ok(None) - } else { - s.parse::() - .map(Some) - .map_err(|e| DeError::custom(format!("parsing error: {}", e))) - } -} - -//impl Selection for &Bed5Addition { -// fn select_by_name(&self, name: &str) -> DatumType { -// match name { -// "name" => DatumType::String(self.name.clone()), -// "score" => DatumType::Float64(self.score), -// _ => panic!("No item named '{}'", name), -// } -// } -//} - -///// The additional three BED6 columns. -//#[derive(Clone, Debug)] -//pub struct Bed6Addition { -// pub name: String, -// pub score: f64, -// pub strand: Option, -//} - -/// A lazy parser for BED-like files. -/// yields [`GenomicRangeRecord>>`] entries. If the file is a BED3 file, -/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining -/// string columns to parse. -pub struct BedlikeIterator { - reader: BufReader>, - line_buffer: String, -} - -impl std::fmt::Debug for BedlikeIterator { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("BedlikeIterator").finish_non_exhaustive() - } -} - -impl BedlikeIterator { - /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser - /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), - /// and end (0-indeed and exclusive) positions. 
- pub fn new(filepath: impl Into) -> Result { - let input_file = InputStream::new(filepath); - // let _has_metadata = input_file.collect_metadata("#", None); - // let reader = input_file.continue_reading()?; - let reader = input_file.reader()?; - let line_buffer = String::with_capacity(PARSE_CAPACITY); - Ok(Self { - reader, - line_buffer, - }) - } -} - -impl Iterator for BedlikeIterator { - type Item = Result>, GRangesError>; - - fn next(&mut self) -> Option { - self.line_buffer.clear(); - - loop { - self.line_buffer.clear(); - match self.reader.read_line(&mut self.line_buffer) { - Ok(0) => return None, - Ok(_) => { - if !self.line_buffer.starts_with('#') { - let line = self.line_buffer.trim_end(); - return Some(parse_bed_lazy(line)); - } - // skip the metadata/comment character - } - Err(e) => return Some(Err(GRangesError::IOError(e))), - } - } - } -} - -/// Lazily parses a BED* format line into the first three columns defining the range, -/// storing the rest as a `String`. -/// -/// Unlike [`parse_bedlike()`], this function returns a concrete -/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. -pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { - let columns: Vec<&str> = line.splitn(4, '\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::Bed3TooFewColumns( - columns.len(), - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - let data = if columns.len() > 3 { - Some(columns[3].to_string()) - } else { - None - }; - - Ok(GenomicRangeRecord { - seqname, - start, - end, - data, - }) -} - -#[allow(dead_code)] -fn parse_strand(symbol: char) -> Result, GRangesError> { - match symbol { - '+' => Ok(Some(Strand::Forward)), - '-' => Ok(Some(Strand::Reverse)), - '.' 
=> Ok(None), - _ => Err(GRangesError::InvalidString), - } -} - -// TODO -///// Parses a BED6 format line into the three columns defining the range, and additional -///// columns -///// -//pub fn parse_bed6(line: &str) -> Result, GRangesError> { -// let columns: Vec<&str> = line.splitn(4, '\t').collect(); -// if columns.len() < 3 { -// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); -// } -// -// let seqname = parse_column(columns[0], line)?; -// let start: Position = parse_column(columns[1], line)?; -// let end: Position = parse_column(columns[2], line)?; -// -// let name = parse_column(columns[3], line)?; -// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; -// -// let data = Bed5Addition { name, score }; -// -// Ok(GenomicRangeRecord { -// seqname, -// start, -// end, -// data, -// }) -//} diff --git a/src/io/parsers/bed/bed3.rs b/src/io/parsers/bed/bed3.rs new file mode 100644 index 0000000..0458b99 --- /dev/null +++ b/src/io/parsers/bed/bed3.rs @@ -0,0 +1,26 @@ +//! BED3 Parsers, which are built off of the [`GenomicRangeRecordEmpty`]. +//! + +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecordEmpty, GRangesError}; +use std::path::PathBuf; + +/// An iterator over BED3 entries (which just contain ranges no data). +#[derive(Debug)] +pub struct Bed3Iterator { + iter: TsvRecordIterator, +} + +impl Bed3Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + Ok(Self { iter }) + } +} + +impl Iterator for Bed3Iterator { + type Item = Result; + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bed4.rs b/src/io/parsers/bed/bed4.rs new file mode 100644 index 0000000..6e03b1d --- /dev/null +++ b/src/io/parsers/bed/bed4.rs @@ -0,0 +1,46 @@ +//! BED4 Parsers, which are built off of the [`GenomicRangeRecordEmpty`] +//! and [`Bed4Addition`]. 
+ +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecord, GRangesError}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(deny_unknown_fields)] +pub struct Bed4Addition { + pub name: String, +} + +/// An iterator over BED4 entries, which contain the three +/// range entries (sequence name, start and end positions), +/// and a feature name +/// +/// Note that the [`Bed5Addition`] is *permissive* in the sense +/// it allows for more input types than the official [BED format +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). +// TODO strict type? +#[derive(Debug)] +pub struct Bed4Iterator { + iter: TsvRecordIterator>, +} + +impl Bed4Iterator { + /// Creates a parsing iterator over a BED4 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + + Ok(Self { iter }) + } +} + +impl Iterator for Bed4Iterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bed5.rs b/src/io/parsers/bed/bed5.rs new file mode 100644 index 0000000..cf0e7cb --- /dev/null +++ b/src/io/parsers/bed/bed5.rs @@ -0,0 +1,52 @@ +//! BED5 Parsers, which are built off of the [`GenomicRangeRecordEmpty`] +//! and [`Bed5Addition`]. + +use super::bed_missing; +use crate::{io::TsvRecordIterator, ranges::GenomicRangeRecord, GRangesError}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +/// * `score`: a score. +// TODO/RENAME: maybe since this goes against spec it should +// be called Bed5AdditionPermissive? 
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(deny_unknown_fields)] +pub struct Bed5Addition { + pub name: String, + #[serde(deserialize_with = "bed_missing")] + pub score: Option, +} + +/// An iterator over BED5 entries, which contain the three +/// range entries (sequence name, start and end positions), +/// a feature name, and a score. +/// +/// Note that the [`Bed5Addition`] is *permissive* in the sense +/// it allows for more input types than the official [BED format +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). +// TODO strict type? +#[derive(Debug)] +pub struct Bed5Iterator { + iter: TsvRecordIterator>, +} + +impl Bed5Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + + Ok(Self { iter }) + } +} + +impl Iterator for Bed5Iterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} diff --git a/src/io/parsers/bed/bedlike.rs b/src/io/parsers/bed/bedlike.rs new file mode 100644 index 0000000..3244e67 --- /dev/null +++ b/src/io/parsers/bed/bedlike.rs @@ -0,0 +1,161 @@ +//! Bed-like file lazy parsers. + +use std::{ + io::{BufRead, BufReader}, + path::PathBuf, +}; + +use crate::{ + io::{ + parsers::{tsv::build_tsv_reader, utils::parse_column}, + InputStream, + }, + ranges::GenomicRangeRecord, + GRangesError, Position, +}; + +// for BedlikeIterator only, TODO needed? +pub const PARSE_CAPACITY: usize = 512; + +/// A lazy parser for BED-like files. +/// yields [`GenomicRangeRecord>>`] entries. If the file is a BED3 file, +/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining +/// string columns to parse. 
+pub struct BedlikeIterator { + reader: BufReader>, + line_buffer: String, +} + +impl std::fmt::Debug for BedlikeIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BedlikeIterator").finish_non_exhaustive() + } +} + +impl BedlikeIterator { + /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser + /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), + /// and end (0-indeed and exclusive) positions. + pub fn new(filepath: impl Into) -> Result { + let input_file = InputStream::new(filepath); + // let _has_metadata = input_file.collect_metadata("#", None); + // let reader = input_file.continue_reading()?; + let reader = input_file.reader()?; + let line_buffer = String::with_capacity(PARSE_CAPACITY); + Ok(Self { + reader, + line_buffer, + }) + } +} + +impl Iterator for BedlikeIterator { + type Item = Result>, GRangesError>; + + fn next(&mut self) -> Option { + self.line_buffer.clear(); + + loop { + self.line_buffer.clear(); + match self.reader.read_line(&mut self.line_buffer) { + Ok(0) => return None, + Ok(_) => { + if !self.line_buffer.starts_with('#') { + let line = self.line_buffer.trim_end(); + return Some(parse_bed_lazy(line)); + } + // skip the metadata/comment character + } + Err(e) => return Some(Err(GRangesError::IOError(e))), + } + } + } +} + +/// Lazily parses a BED* format line into the first three columns defining the range, +/// storing the rest as a `String`. +/// +/// Unlike [`parse_bedlike()`], this function returns a concrete +/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. 
+pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { + let columns: Vec<&str> = line.splitn(4, '\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::Bed3TooFewColumns( + columns.len(), + line.to_string(), + )); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + let data = if columns.len() > 3 { + Some(columns[3].to_string()) + } else { + None + }; + + Ok(GenomicRangeRecord { + seqname, + start, + end, + data, + }) +} + +/// Inspect the first line to check that it looks like a valid BED-like +/// file, i.e. the first column is there (there are no reasonable checks +/// for sequence names other than presence), and the next to columns can +/// be parsed into a [`Position`]. +// TODO/NOTE: this is an older-style parser. We might want to try +// using csv here. + +pub fn valid_bedlike(filepath: impl Into) -> Result { + let filepath = filepath.into(); + let mut reader = build_tsv_reader(filepath)?; + let mut records = reader.records(); + if let Some(result) = records.next() { + let record = result?; + + if record.len() < 3 { + // too few columns to be BED-like + return Ok(false); + } + + // Attempt to parse the second and third columns as positions + let start_result = record.get(1).unwrap().trim().parse::(); + let end_result = record.get(2).unwrap().trim().parse::(); + + // Check if both positions are valid + match (start_result, end_result) { + (Ok(_), Ok(_)) => Ok(true), // both are valid + _ => Ok(false), // one or both is not valid + } + } else { + // No records found + Ok(false) + } +} + +#[cfg(test)] +mod tests { + use super::valid_bedlike; + + #[test] + fn test_valid_bedlike() { + // even bed files work + assert_eq!(valid_bedlike("tests_data/example.bed").unwrap(), true); + + // but not everything + assert_eq!( + valid_bedlike("tests_data/invalid_format.bed").unwrap(), + false + ); + + assert_eq!( + 
valid_bedlike("tests_data/example_bedlike.tsv").unwrap(), + true + ); + } +} diff --git a/src/io/parsers/bed/mod.rs b/src/io/parsers/bed/mod.rs new file mode 100644 index 0000000..80fdd46 --- /dev/null +++ b/src/io/parsers/bed/mod.rs @@ -0,0 +1,99 @@ +//! BED Types and Functionality +//! +//! The BED (Browser Extensible Format) is a TSV format in bioinformatics. +//! It has a fairly strict [specification](https://samtools.github.io/hts-specs/BEDv1.pdf), +//! but in practice it is quite permissive, and in bioinformatics one encounters lots +//! of "BED-like" files. +//! +//! # Design +//! +//! Since BED files can be thought of BED3 + an optional *addition*, the type +//! returned by these parsing iterators is [`GenomicRangeRecord`] where +//! `U` is some sort of addition type like [`Bed5Addition`]. +//! +//! # ⚠️ Stability +//! +//! This module defines core BED types, but is under active development. +//! + +pub mod bed3; +pub mod bed4; +pub mod bed5; +pub mod bedlike; + +pub use bed3::Bed3Iterator; +pub use bed4::{Bed4Addition, Bed4Iterator}; +pub use bed5::{Bed5Addition, Bed5Iterator}; +pub use bedlike::{valid_bedlike, BedlikeIterator}; + +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer}; +use std::str::FromStr; + +/// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to +/// be used for missing strands, but in practice it can be found to represent +/// missing scores, etc too. +pub fn bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let missing_chars = &["."]; + deserialize_option_generic(deserializer, missing_chars) // Use the generic deserializer with specific placeholders +} + +/// Nucleotide strand enum type. 
+#[derive(Clone, Debug)] +pub enum Strand { + Forward, + Reverse, +} + +/// Deserializes some value of type `t` with some possible missing +/// character `missing_chars` into [`Option`]. +pub fn deserialize_option_generic<'de, D, T>( + deserializer: D, + missing_chars: &'de [&'de str], +) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let s: String = Deserialize::deserialize(deserializer)?; + if missing_chars.contains(&s.as_str()) { + Ok(None) + } else { + s.parse::() + .map(Some) + .map_err(|e| DeError::custom(format!("parsing error: {}", e))) + } +} +// TODO +///// Parses a BED6 format line into the three columns defining the range, and additional +///// columns +///// +//pub fn parse_bed6(line: &str) -> Result, GRangesError> { +// let columns: Vec<&str> = line.splitn(4, '\t').collect(); +// if columns.len() < 3 { +// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); +// } +// +// let seqname = parse_column(columns[0], line)?; +// let start: Position = parse_column(columns[1], line)?; +// let end: Position = parse_column(columns[2], line)?; +// +// let name = parse_column(columns[3], line)?; +// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; +// +// let data = Bed5Addition { name, score }; +// +// Ok(GenomicRangeRecord { +// seqname, +// start, +// end, +// data, +// }) +//} diff --git a/src/io/parsers/detect.rs b/src/io/parsers/detect.rs new file mode 100644 index 0000000..54f0312 --- /dev/null +++ b/src/io/parsers/detect.rs @@ -0,0 +1,181 @@ +//! Filetype detection functionality. +//! 
+ +use serde::Deserialize; +use std::path::PathBuf; + +use super::{ + bed::{valid_bedlike, Bed4Addition, Bed4Iterator}, + tsv::build_tsv_reader, + utils::get_base_extension, + Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator, +}; +use crate::{ + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + GRangesError, +}; + +/// Enum that connects a genomic ranges file type to its specific parser. +#[derive(Debug)] +pub enum GenomicRangesParser { + Bed3(Bed3Iterator), + Bed4(Bed4Iterator), + Bed5(Bed5Iterator), + Bedlike(BedlikeIterator), + Unsupported, +} + +/// Enum that indicates the filetype of some genomic ranges file. +#[derive(Debug, PartialEq)] +pub enum GenomicRangesFile { + Bed3(PathBuf), + Bed4(PathBuf), + Bed5(PathBuf), + Bedlike(PathBuf), + Unsupported, +} + +/// Detect the BED variant by trying to deserialize the first line. +pub fn detect_bed_variant( + filepath: impl Into, +) -> Result, GRangesError> { + let filepath = filepath.into(); + + if try_deserialize::>(&filepath)? { + Ok(Some(GenomicRangesFile::Bed5(filepath))) + } else if try_deserialize::>(&filepath)? { + Ok(Some(GenomicRangesFile::Bed4(filepath))) + } else if try_deserialize::(&filepath)? { + Ok(Some(GenomicRangesFile::Bed3(filepath))) + } else { + Ok(None) + } +} + +/// Try to deserialize into a generic type `T`. +fn try_deserialize Deserialize<'de> + std::fmt::Debug>( + filepath: impl Into, +) -> Result { + let filepath = filepath.into(); + let reader = build_tsv_reader(&filepath)?; + let mut iter = reader.into_deserialize::(); + let next_item = iter.next(); + if let Some(result) = next_item { + Ok(result.is_ok()) + } else { + Err(GRangesError::EmptyFile( + filepath.to_string_lossy().to_string(), + )) + } +} + +impl GenomicRangesFile { + /// Detect the type of range genomic range file type we are working with, and output + /// the appropriate [`GenomicRangesFile`] enum variant. + /// + /// Detection works like this: + /// 1. Skip comment lines, starting with `#`. + /// 2. 
Retrieve extension, removing any additional compression-related + /// extensions (`.gz` and `.bgz`) if present. + /// 3. Read the first line, and try to parse the second and third columns + /// into [`Position`] types, to check if this is a BED-like file (i.e. + /// the first three columns are sequence name, start, and end positions). + /// 4. Match against + /// + /// Currently this supports: + /// 1. BED3 files, without range data. + /// 2. BED-like files, with BED3 first columns and optional additional columns + /// after that. If there is no additional columnar data after the first + /// three BED3 columns, the type is [`GenomicRangesFile::Bed3`]. If there + /// is additional columnar data, it is [`GenomicRangesFile::Bedlike`]. + /// 3. Files with `.tsv` extension but are BED-like (first three + /// columns are BED3) will have the type [`GenomicRangesFile::Bed3`]. This + /// is because downstream [`GRanges`] operations need to know if any + /// additional data is present, which would need to be put in a data container. + /// 4. BED5 files, which are BED3 + a *feature name* and a *strand* column. + /// 5. If the file type does not satisfy any of the rules above, it is + /// [`GenomicRangesFile::Unsupported`]. + /// + /// See the `match` statement in the source code for the exact rules. Additional + /// genomic range file formats like GTF/GFF can easily be added later. + /// + /// [`GRanges`]: crate::granges::GRanges + pub fn detect(filepath: impl Into) -> Result { + let filepath: PathBuf = filepath.into(); + + let is_valid_bedlike = valid_bedlike(&filepath)?; + + // get the extension, as a hint + let extension = + get_base_extension(&filepath).ok_or(GRangesError::CouldNotDetectRangesFiletype)?; + + // If it's got a .tsv extension, take this as a hint it *isn't a BED*, + // thus, this goes to the BedlikeIterator parser. 
+ if extension.ends_with("tsv") && is_valid_bedlike { + return Ok(GenomicRangesFile::Bedlike(filepath)); + } + + // Let's try the strict serde-based deserialization approach first. + if let Some(bed_filetype) = detect_bed_variant(&filepath)? { + return Ok(bed_filetype); + } + + if is_valid_bedlike { + return Ok(GenomicRangesFile::Bedlike(filepath)); + } + + Ok(GenomicRangesFile::Unsupported) + } + + /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error + /// if the filetype is not supported. + /// + /// This returns a [`GenomicRangesParser`] enum, since the parsing iterator filetype + /// cannot be known at compile time. + pub fn parsing_iterator( + filepath: impl Clone + Into, + ) -> Result { + let path = filepath.into(); + match Self::detect(path)? { + GenomicRangesFile::Bed3(path) => { + Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)) + } + GenomicRangesFile::Bed4(path) => { + Ok(GenomicRangesParser::Bed4(Bed4Iterator::new(path)?)) + } + GenomicRangesFile::Bed5(path) => { + Ok(GenomicRangesParser::Bed5(Bed5Iterator::new(path)?)) + } + GenomicRangesFile::Bedlike(path) => { + Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)) + } + GenomicRangesFile::Unsupported => Err(GRangesError::UnsupportedGenomicRangesFileFormat), + } + } +} + +#[cfg(test)] +mod tests { + use super::GenomicRangesFile; + + #[test] + fn test_rangefiletype_detect() { + let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bed3(_) + )); + + let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bedlike(_) + )); + + let range_filetype = GenomicRangesFile::detect("tests_data/test_case_03.bed"); + assert!(matches!( + range_filetype.unwrap(), + GenomicRangesFile::Bed5(_) + )); + } +} diff --git a/src/io/parsers/filters.rs b/src/io/parsers/filters.rs new file mode 
100644 index 0000000..d179ebc --- /dev/null +++ b/src/io/parsers/filters.rs @@ -0,0 +1,268 @@ +//! Filters for parsing iterators. + +use crate::error::GRangesError; +use crate::io::TsvRecordIterator; +use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; +use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; +use std::collections::HashSet; + +use super::bed::{Bed4Addition, Bed4Iterator}; +use super::{Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator}; + +/// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. +/// +/// Note that that the exclude filter is prioritized over the retain filter. So, +/// if a pipeline contains both, then if a chromosome is supplied to +/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, +/// it is skipped. This is to prevent validation and returning a [`Result`]. +/// +/// # Example +/// +/// ``` +/// use granges::prelude::*; +/// +/// let iter = Bed3Iterator::new("tests_data/example.bed") +/// .expect("error reading file") +/// .exclude_seqnames(&vec!["chr1".to_string()]); +/// +/// let seqlens = seqlens! 
{ "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; +/// let gr = GRangesEmpty::from_iter(iter, &seqlens) +/// .expect("parsing error"); +/// let mut iter = gr.iter_ranges(); +/// +/// // the first range should be the third range in the file, +/// // chr2:4-5 +/// assert_eq!(iter.next().unwrap().start, 4); +/// assert_eq!(iter.next().unwrap().end, 6); +/// assert_eq!(iter.next().unwrap().end, 15); +/// assert_eq!(iter.next(), None); +/// ``` +#[derive(Debug)] +pub struct FilteredRanges +where + I: Iterator>, +{ + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, +} + +impl FilteredRanges +where + I: Iterator>, +{ + pub fn new( + inner: I, + retain_seqnames: Option<&Vec>, + exclude_seqnames: Option<&Vec>, + ) -> Self { + let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); + Self { + inner, + retain_seqnames, + exclude_seqnames, + } + } +} + +/// Range-filtering iterator implementation for [`GenomicRangeRecord`]. +impl Iterator for FilteredRanges> +where + I: Iterator, GRangesError>>, +{ + type Item = Result, GRangesError>; + + /// Get the next filtered entry, prioritizing exclude over retain. + fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +/// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. +impl Iterator for FilteredRanges +where + I: Iterator>, +{ + type Item = Result; + + /// Get the next filtered entry, prioritizing exclude over retain. 
+ fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +impl GeneralRangeRecordIterator> + for TsvRecordIterator> +where + U: Clone + for<'de> serde::Deserialize<'de>, +{ + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator>> for BedlikeIterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator for Bed3Iterator { + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for Bed4Iterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for Bed5Iterator { + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } 
+ fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +#[derive(Debug)] +pub struct UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + inner: I, +} + +impl UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + pub fn new(inner: I) -> Self { + Self { inner } + } +} + +impl Iterator for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + if let Some(result) = self.inner.next() { + return Some(result.and_then(|record| match record.data { + Some(data) => Ok(GenomicRangeRecord::new( + record.seqname, + record.start, + record.end, + data, + )), + None => Err(GRangesError::TryUnwrapDataError), + })); + } + None + } +} + +impl GenomicRangeRecordUnwrappable for BedlikeIterator { + fn try_unwrap_data(self) -> UnwrappedRanges { + UnwrappedRanges::new(self) + } +} diff --git a/src/io/parsers/mod.rs b/src/io/parsers/mod.rs index 681620e..46ede26 100644 --- a/src/io/parsers/mod.rs +++ b/src/io/parsers/mod.rs @@ -62,568 +62,12 @@ //! 
pub mod bed; +pub mod detect; +pub mod filters; pub mod tsv; pub mod utils; pub use bed::{Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator}; +pub use detect::{GenomicRangesFile, GenomicRangesParser}; -use std::collections::HashSet; -use std::io::BufRead; -use std::path::PathBuf; - -use crate::error::GRangesError; -use crate::io::file::InputStream; -use crate::ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}; -use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; -use crate::Position; - -use self::utils::get_base_extension; - -use super::TsvRecordIterator; - -/// Inspect the first line to check that it looks like a valid BED-like -/// file, i.e. the first column is there (there are no reasonable checks -/// for sequence names other than presence), and the next to columns can -/// be parsed into a [`Position`]. -fn valid_bedlike(input_file: &mut InputStream) -> Result { - let _metadata = input_file.collect_metadata("#", None)?; - let mut reader = input_file.continue_reading()?; - let mut first_line = String::new(); - reader.read_line(&mut first_line)?; - - let columns = first_line - .splitn(4, '\t') - .map(String::from) - .collect::>(); - - if columns.len() < 3 { - // too few columns to be BED-like - return Ok(false); - } - - // Attempt to parse the second and third columns as positions - let start_result = columns[1].trim().parse::(); - let end_result = columns[2].trim().parse::(); - - // Check if both positions are valid - match (start_result, end_result) { - (Ok(_), Ok(_)) => Ok(true), // both are valid - _ => Ok(false), // one or both is not valid - } -} - -// TODO: we can combine this and the above into an enum -fn valid_bed5(input_file: &mut InputStream) -> Result { - let _metadata = input_file.collect_metadata("#", None)?; - let mut reader = input_file.continue_reading()?; - let mut first_line = String::new(); - reader.read_line(&mut first_line)?; - - let columns = first_line - .splitn(4, '\t') - .map(String::from) - 
.collect::>(); - - if columns.len() == 5 { - // Attempt to parse the second and third columns as positions - let valid_start = columns[1].trim().parse::().is_ok(); - let valid_end = columns[2].trim().parse::().is_ok(); - // this should always be true - let _ = columns[3].trim().parse::().is_ok(); - let valid_strand = columns[4].trim().parse::().is_ok(); - return Ok(valid_start && valid_end && valid_strand); - } - Ok(false) -} - -/// Enum that connects a genomic ranges file type to its specific parser. -#[derive(Debug)] -pub enum GenomicRangesParser { - Bed3(Bed3Iterator), - Bed5(Bed5Iterator), - Bedlike(BedlikeIterator), - Unsupported, -} - -/// Enum that indicates the filetype of some genomic ranges file. -#[derive(Debug, PartialEq)] -pub enum GenomicRangesFile { - Bed3(PathBuf), - Bed5(PathBuf), - Bedlike(PathBuf), - Unsupported, -} - -impl GenomicRangesFile { - /// Detect the type of range genomic range file type we are working with, and output - /// the appropriate [`GenomicRangesFile`] enum variant. - /// - /// Detection works like this: - /// 1. Skip comment lines, starting with `#`. - /// 2. Retrieve extension, removing any additional compression-related - /// extensions (`.gz` and `.bgz`) if present. - /// 3. Read the first line, and try to parse the second and third columns - /// into [`Position`] types, to check if this is a BED-like file (i.e. - /// the first three columns are sequence name, start, and end positions). - /// 4. Match against - /// - /// Currently this supports: - /// 1. BED3 files, without range data. - /// 2. BED-like files, with BED3 first columns and optional additional columns - /// after that. If there is no additional columnar data after the first - /// three BED3 columns, the type is [`GenomicRangesFile::Bed3`]. If there - /// is additional columnar data, it is [`GenomicRangesFile::Bedlike`]. - /// 3. Files with `.tsv` extension but are BED-like (first three - /// columns are BED3) will have the type [`GenomicRangesFile::Bed3`]. 
This - /// is because downstream [`GRanges`] operations need to know if any - /// additional data is present, which would need to be put in a data container. - /// 4. BED5 files, which are BED3 + a *feature name* and a *strand* column. - /// 5. If the file type does not satisfy any of the rules above, it is - /// [`GenomicRangesFile::Unsupported`]. - /// - /// See the `match` statement in the source code for the exact rules. Additional - /// genomic range file formats like GTF/GFF can easily be added later. - /// - /// [`GRanges`]: crate::granges::GRanges - pub fn detect(filepath: impl Into) -> Result { - let path: PathBuf = filepath.into(); - let mut input_file = InputStream::new(&path); - let _metadata = input_file.collect_metadata("#", None)?; - let number_columns = input_file.detect_columns("\t")?; - - // get the extension, as a hint - let extension = - get_base_extension(&path).ok_or(GRangesError::CouldNotDetectRangesFiletype)?; - - // test if the first row can be parsed into a BED-like file format. - let is_valid_bedlike = valid_bedlike(&mut input_file)?; - let is_valid_bed5 = valid_bed5(&mut input_file)?; // TODO OPTIMIZE merge with above - let file_type = match ( - extension.as_str(), - number_columns, - is_valid_bedlike, - is_valid_bed5, - ) { - ("bed", 3, true, false) => GenomicRangesFile::Bed3(path), - ("tsv", 3, true, false) => GenomicRangesFile::Bed3(path), - ("bed", 5, true, true) => GenomicRangesFile::Bed5(path), - ("bed", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), - ("tsv", n, true, false) if n > 3 => GenomicRangesFile::Bedlike(path), - _ => GenomicRangesFile::Unsupported, - }; - Ok(file_type) - } - - /// Detect the genomic range filetype and link it to its parsing iterator, or raise an error - /// if the filetype is not supported. - /// - /// This returns a [`GenomicRangesParser`] enum, since the parsing iterator filetype - /// cannot be known at compile time. 
- pub fn parsing_iterator( - filepath: impl Clone + Into, - ) -> Result { - let path = filepath.into(); - match Self::detect(path)? { - GenomicRangesFile::Bed3(path) => { - Ok(GenomicRangesParser::Bed3(Bed3Iterator::new(path)?)) - } - GenomicRangesFile::Bed5(path) => { - Ok(GenomicRangesParser::Bed5(Bed5Iterator::new(path)?)) - } - GenomicRangesFile::Bedlike(path) => { - Ok(GenomicRangesParser::Bedlike(BedlikeIterator::new(path)?)) - } - GenomicRangesFile::Unsupported => Err(GRangesError::UnsupportedGenomicRangesFileFormat), - } - } -} - -/// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. -/// -/// Note that that the exclude filter is prioritized over the retain filter. So, -/// if a pipeline contains both, then if a chromosome is supplied to -/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, -/// it is skipped. This is to prevent validation and returning a [`Result`]. -/// -/// # Example -/// -/// ``` -/// use granges::prelude::*; -/// -/// let iter = Bed3Iterator::new("tests_data/example.bed") -/// .expect("error reading file") -/// .exclude_seqnames(&vec!["chr1".to_string()]); -/// -/// let seqlens = seqlens! 
{ "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; -/// let gr = GRangesEmpty::from_iter(iter, &seqlens) -/// .expect("parsing error"); -/// let mut iter = gr.iter_ranges(); -/// -/// // the first range should be the third range in the file, -/// // chr2:4-5 -/// assert_eq!(iter.next().unwrap().start, 4); -/// assert_eq!(iter.next().unwrap().end, 6); -/// assert_eq!(iter.next().unwrap().end, 15); -/// assert_eq!(iter.next(), None); -/// ``` -#[derive(Debug)] -pub struct FilteredRanges -where - I: Iterator>, -{ - inner: I, - retain_seqnames: Option>, - exclude_seqnames: Option>, -} - -impl FilteredRanges -where - I: Iterator>, -{ - pub fn new( - inner: I, - retain_seqnames: Option<&Vec>, - exclude_seqnames: Option<&Vec>, - ) -> Self { - let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); - let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); - Self { - inner, - retain_seqnames, - exclude_seqnames, - } - } -} - -/// Range-filtering iterator implementation for [`GenomicRangeRecord`]. -impl Iterator for FilteredRanges> -where - I: Iterator, GRangesError>>, -{ - type Item = Result, GRangesError>; - - /// Get the next filtered entry, prioritizing exclude over retain. - fn next(&mut self) -> Option { - for item in self.inner.by_ref() { - match &item { - Ok(entry) => { - if self - .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } - if self - .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } - } - Err(_) => return Some(item), - } - } - None - } -} - -/// Range-filtering iterator implementation for [`GenomicRangeEmptyRecord`]. -impl Iterator for FilteredRanges -where - I: Iterator>, -{ - type Item = Result; - - /// Get the next filtered entry, prioritizing exclude over retain. 
- fn next(&mut self) -> Option { - for item in self.inner.by_ref() { - match &item { - Ok(entry) => { - if self - .exclude_seqnames - .as_ref() - .map_or(false, |ex| ex.contains(&entry.seqname)) - { - continue; - } - if self - .retain_seqnames - .as_ref() - .map_or(true, |rt| rt.contains(&entry.seqname)) - { - return Some(item); - } - } - Err(_) => return Some(item), - } - } - None - } -} - -impl GeneralRangeRecordIterator> - for TsvRecordIterator> -where - U: Clone + for<'de> serde::Deserialize<'de>, -{ - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator>> for BedlikeIterator { - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges>> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges>> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator for Bed3Iterator { - fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator> for Bed5Iterator { - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -impl GeneralRangeRecordIterator> for UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - fn retain_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - 
FilteredRanges::new(self, Some(&seqnames.to_vec()), None) - } - fn exclude_seqnames( - self, - seqnames: &[String], - ) -> FilteredRanges> { - FilteredRanges::new(self, None, Some(&seqnames.to_vec())) - } -} - -#[derive(Debug)] -pub struct UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - inner: I, -} - -impl UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - pub fn new(inner: I) -> Self { - Self { inner } - } -} - -impl Iterator for UnwrappedRanges -where - I: Iterator>, GRangesError>>, -{ - type Item = Result, GRangesError>; - - fn next(&mut self) -> Option { - if let Some(result) = self.inner.next() { - return Some(result.and_then(|record| match record.data { - Some(data) => Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )), - None => Err(GRangesError::TryUnwrapDataError), - })); - } - None - } -} - -impl GenomicRangeRecordUnwrappable for BedlikeIterator { - fn try_unwrap_data(self) -> UnwrappedRanges { - UnwrappedRanges::new(self) - } -} - -/// Parses a single column from a string slice into a specified type. -/// -/// This function is particularly useful for converting columns in genomic data files -/// from string representation to a specific type like `usize`, `f64`, etc. -/// -/// # Type Parameters -/// -/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. -/// -/// # Arguments -/// -/// * `column`: The string slice representing the column to be parsed. -/// * `line`: The entire line from which the column was extracted. Used for error reporting. -/// -/// # Errors -/// -/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. 
-pub fn parse_column(column: &str, line: &str) -> Result -where - ::Err: std::fmt::Debug, -{ - column - .parse::() - .map_err(|_| GRangesError::InvalidColumnType { - expected_type: std::any::type_name::().to_string(), // Provides the expected type name - found_value: column.to_string(), - line: line.to_string(), - }) -} - -/// Parses a BED-like TSV format line into its constituent components. -/// -/// BED-like formats contain BED3 columns (chromosome, start, and end) and -/// others. It's an non-standard but extensible (since it's just a TSV), and sensible -/// for a variety of genomic data. -/// -/// # Arguments -/// -/// * `line`: A string slice representing a line in BED3 format. -/// -/// # Errors -/// -/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns -/// or if any column cannot be correctly parsed. -pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { - let columns: Vec<&str> = line.split('\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::BedTooFewColumns( - columns.len(), - 3, - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - // Collect remaining columns - let additional_columns = columns[3..].to_vec(); - - Ok((seqname, start, end, additional_columns)) -} - -#[cfg(test)] -mod tests { - use crate::io::{ - parsers::{utils::get_base_extension, valid_bedlike}, - InputStream, - }; - - use super::GenomicRangesFile; - - #[test] - fn test_get_base_extension() { - assert_eq!(get_base_extension("test.bed.gz").unwrap(), "bed"); - assert_eq!(get_base_extension("test.bed").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.bed.gz").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.bed").unwrap(), "bed"); - assert_eq!(get_base_extension("some/path/test.gff").unwrap(), "gff"); - 
assert_eq!(get_base_extension("some/path/test.gff.gz").unwrap(), "gff"); - assert_eq!(get_base_extension("test.gff.gz").unwrap(), "gff"); - assert_eq!(get_base_extension("test.gff").unwrap(), "gff"); - assert_eq!(get_base_extension("test"), None); - assert_eq!(get_base_extension("foo/test"), None); - } - - #[test] - fn test_rangefiletype_detect() { - let range_filetype = GenomicRangesFile::detect("tests_data/example.bed"); - assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bed3(_) - )); - - let range_filetype = GenomicRangesFile::detect("tests_data/example_bedlike.tsv"); - assert!(matches!( - range_filetype.unwrap(), - GenomicRangesFile::Bedlike(_) - )); - } - - #[test] - fn test_valid_bedlike() { - assert_eq!( - valid_bedlike(&mut InputStream::new("tests_data/example.bed")).unwrap(), - true - ); - assert_eq!( - valid_bedlike(&mut InputStream::new("tests_data/invalid_format.bed")).unwrap(), - false - ); - } - - //#[test] - //fn test_parser() { - // // based on this example: https://docs.rs/noodles-bed/latest/noodles_bed/struct.Reader.html#method.records - // let iter = Bed3Iterator::new("tests_data/noodles_example.bed").unwrap(); - - // let seqlens = seqlens! { "sq0" => 10 }; - // let gr: GRanges = GRanges::from_iter(iter, &seqlens).unwrap(); - - // assert_eq!(gr.len(), 2); - - // // let mut gr_iter = gr.iter_ranges(); - // // let first_interval = gr_iter.next().unwrap(); - // // assert_eq!(first_interval.first, 7); - // // assert_eq!(first_interval.last, 12); - // // - // // let second_interval = gr_iter.next().unwrap(); - // // assert_eq!(second_interval.first, 20); - // // assert_eq!(second_interval.last, 33); - //} - - //#[test] - //fn test_invalid_bedlike_iterator() { - // let iter = BedlikeIterator::new("tests_data/invalid.bed").unwrap(); - // let seqlens = seqlens! 
{ "sq0" => 10 }; - // let result: Result, _> = GRanges::from_iter(iter.drop_data(), &seqlens); - - // // note: the Rust LSP thinks this isn't used for some reason, so prefaced with _ - // // to silence warnings. - // let _msg = "column '-1' in 'chr1\t-1\t20'".to_string(); - // assert!(matches!(result, Err(GRangesError::InvalidColumnType(_msg)))); - //} -} +pub use filters::{FilteredRanges, UnwrappedRanges}; diff --git a/src/io/parsers/tsv.rs b/src/io/parsers/tsv.rs index 7c89864..13b6702 100644 --- a/src/io/parsers/tsv.rs +++ b/src/io/parsers/tsv.rs @@ -1,7 +1,7 @@ //! Essential TSV parsing functionality, which wraps the blazingly-fast [`csv`] crate's //! deserialization method using [`serde`]. -use csv::{DeserializeRecordsIntoIter, ReaderBuilder}; +use csv::{DeserializeRecordsIntoIter, Reader, ReaderBuilder}; use flate2::read::GzDecoder; use serde::de::Error as DeError; use serde::{Deserialize, Deserializer}; @@ -12,6 +12,38 @@ use std::str::FromStr; use crate::error::GRangesError; +/// Build a TSV reader which ignores comment lines, works on gzip-compressed +/// files, etc. +/// +/// # ⚠️ Stability +/// +/// This will likely change into a type with methods. +/// +/// # Developers Notes +/// +/// Currently headers are ignored, and not properly passed +/// to the output. If you need this feature prioritized, please submit a +/// GitHub issue. +pub fn build_tsv_reader( + filepath: impl Into, +) -> Result>, GRangesError> { + let filepath = filepath.into(); + let file = File::open(&filepath)?; + let is_gzipped = is_gzipped_file(&filepath)?; + let stream: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + + let reader = ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .comment(Some(b'#')) + .from_reader(stream); + Ok(reader) +} + /// Deserializes some value of type `t` with some possible missing /// character `missing_chars` into [`Option`]. 
pub fn deserialize_option_generic<'de, D, T>( @@ -47,7 +79,7 @@ impl std::fmt::Debug for TsvRecordIterator { } /// Check if a file is a gzipped by looking for the magic numbers -fn is_gzipped_file(file_path: impl Into) -> io::Result { +pub fn is_gzipped_file(file_path: impl Into) -> io::Result { let mut file = File::open(file_path.into())?; let mut buffer = [0; 2]; file.read_exact(&mut buffer)?; @@ -68,21 +100,7 @@ where /// E.g. for VCF, it would need to be parsed. pub fn new(filepath: impl Into) -> Result { let filepath = filepath.into(); - - let file = File::open(&filepath)?; - let is_gzipped = is_gzipped_file(&filepath)?; - let stream: Box = if is_gzipped { - Box::new(GzDecoder::new(file)) - } else { - Box::new(file) - }; - - let reader = ReaderBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .comment(Some(b'#')) - .from_reader(stream); - + let reader = build_tsv_reader(filepath)?; let inner = reader.into_deserialize(); Ok(Self { inner }) diff --git a/src/io/parsers/utils.rs b/src/io/parsers/utils.rs index a841036..843f950 100644 --- a/src/io/parsers/utils.rs +++ b/src/io/parsers/utils.rs @@ -1,5 +1,71 @@ use std::path::Path; +use crate::{GRangesError, Position}; + +/// Parses a single column from a string slice into a specified type. +/// +/// This function is particularly useful for converting columns in genomic data files +/// from string representation to a specific type like `usize`, `f64`, etc. +/// +/// # Type Parameters +/// +/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. +/// +/// # Arguments +/// +/// * `column`: The string slice representing the column to be parsed. +/// * `line`: The entire line from which the column was extracted. Used for error reporting. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. 
+pub fn parse_column(column: &str, line: &str) -> Result +where + ::Err: std::fmt::Debug, +{ + column + .parse::() + .map_err(|_| GRangesError::InvalidColumnType { + expected_type: std::any::type_name::().to_string(), // Provides the expected type name + found_value: column.to_string(), + line: line.to_string(), + }) +} + +/// Parses a BED-like TSV format line into its constituent components. +/// +/// BED-like formats contain BED3 columns (chromosome, start, and end) and +/// others. It's an non-standard but extensible (since it's just a TSV), and sensible +/// for a variety of genomic data. +/// +/// # Arguments +/// +/// * `line`: A string slice representing a line in BED3 format. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns +/// or if any column cannot be correctly parsed. +pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedTooFewColumns( + columns.len(), + 3, + line.to_string(), + )); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + // Collect remaining columns + let additional_columns = columns[3..].to_vec(); + + Ok((seqname, start, end, additional_columns)) +} + /// Get the *base* extension to help infer filetype, which ignores compression-related /// extensions (`.gz` and `.bgz`). 
pub fn get_base_extension>(filepath: P) -> Option { @@ -31,3 +97,22 @@ pub fn get_base_extension>(filepath: P) -> Option { None } } + +#[cfg(test)] +mod tests { + use super::get_base_extension; + + #[test] + fn test_get_base_extension() { + assert_eq!(get_base_extension("test.bed.gz").unwrap(), "bed"); + assert_eq!(get_base_extension("test.bed").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.bed.gz").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.bed").unwrap(), "bed"); + assert_eq!(get_base_extension("some/path/test.gff").unwrap(), "gff"); + assert_eq!(get_base_extension("some/path/test.gff.gz").unwrap(), "gff"); + assert_eq!(get_base_extension("test.gff.gz").unwrap(), "gff"); + assert_eq!(get_base_extension("test.gff").unwrap(), "gff"); + assert_eq!(get_base_extension("test"), None); + assert_eq!(get_base_extension("foo/test"), None); + } +} diff --git a/src/iterators.rs b/src/iterators.rs index 8e89373..3b70f4b 100644 --- a/src/iterators.rs +++ b/src/iterators.rs @@ -1,17 +1,18 @@ -//! Iterators over genomic ranges and their data. +//! Iterators over genomic ranges. //! -//! Note that *parsing iterators* are in [`parsers`]. +//! Note that *parsing iterators* are in [`parsers`]. These work over //! //! [`parsers`]: crate::io::parsers use genomap::GenomeMap; use crate::{ - ranges::GenomicRangeIndexedRecord, - traits::{GenericRange, IterableRangeContainer, RangeContainer}, + granges::GRanges, + ranges::{GenomicRangeIndexedRecord, GenomicRangeRecord}, + traits::{GenericRange, IndexedDataContainer, IterableRangeContainer, RangeContainer}, }; /// An iterator yielding [`GenomicRangeIndexedRecord`], which store -/// indices to the sequence names and data container. +/// indices to their sequence names and data container elements. 
/// /// # Developer Notes /// Using indices, rather than references to data items and `&str` directly, @@ -74,9 +75,80 @@ where } } +/// An iterator over [`GenomicRangeRecord`] with the data +/// accessed from a [`IndexedDataContainer`]. +pub struct GRangesRecordIterator<'a, R, T> +where + R: IterableRangeContainer, +{ + seqnames: Vec, + ranges: &'a GenomeMap, + data: &'a T, + current_seqname_index: usize, + current_range_iter: Box::RangeType> + 'a>, +} + +impl<'a, R, T> GRangesRecordIterator<'a, R, T> +where + R: IterableRangeContainer, +{ + pub fn new(granges: &'a GRanges) -> Self { + let current_range_iter = granges.ranges.get_by_index(0).unwrap().iter_ranges(); + Self { + seqnames: granges.seqnames(), + ranges: &granges.ranges, + data: granges.data.as_ref().unwrap(), + current_seqname_index: 0, + current_range_iter, + } + } +} + +impl<'a, R, T> Iterator for GRangesRecordIterator<'a, R, T> +where + R: RangeContainer + IterableRangeContainer, + ::RangeType: GenericRange, + T: IndexedDataContainer, +{ + // no matter what the input type is, we can always + // turn it into this general type. + type Item = GenomicRangeRecord<::OwnedItem>; + + fn next(&mut self) -> Option { + loop { + if let Some(next_range) = self.current_range_iter.next() { + let data = self.data.get_owned(next_range.index().unwrap()); + return Some(GenomicRangeRecord { + // TODO how expensive is cloning here? + seqname: self.seqnames[self.current_seqname_index].clone(), + start: next_range.start(), + end: next_range.end(), + data, + }); + } else { + // try to load another sequence's set of ranges. 
+ self.current_seqname_index += 1; + if self.current_seqname_index >= self.ranges.len() { + // we're out of range container iterators + return None; + } + self.current_range_iter = self + .ranges + .get_by_index(self.current_seqname_index) + .unwrap() + .iter_ranges(); + } + } + } +} + #[cfg(test)] mod tests { - use crate::{ranges::GenomicRangeIndexedRecord, test_utilities::granges_test_case_01}; + use crate::{ + iterators::GRangesRecordIterator, + ranges::{GenomicRangeIndexedRecord, GenomicRangeRecord}, + test_utilities::granges_test_case_01, + }; use super::GRangesIterator; @@ -89,4 +161,26 @@ mod tests { GenomicRangeIndexedRecord::new(0, 0, 5, Some(0)) ); } + + #[test] + fn test_genomic_record_ranges_iterator() { + let gr = granges_test_case_01(); + let mut iter = GRangesRecordIterator::new(&gr); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 0, 5, 1.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 4, 7, 8.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1) + ); + assert_eq!( + iter.next().unwrap(), + GenomicRangeRecord::new("chr2".to_string(), 10, 20, 3.7) + ); + } } diff --git a/src/lib.rs b/src/lib.rs index a50dbe4..5043309 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -359,6 +359,7 @@ pub mod granges; pub mod io; pub mod iterators; pub mod join; +pub mod merging_iterators; pub mod ranges; pub mod sequences; pub mod test_utilities; @@ -369,6 +370,8 @@ pub mod traits; pub mod commands; pub mod reporting; +pub use crate::error::GRangesError; + /// The main position type in GRanges. /// /// This type is currently an unwrapped [`u32`]. 
This should handle @@ -406,15 +409,24 @@ pub mod prelude { pub use crate::io::file::read_seqlens; pub use crate::io::tsv::BED_TSV; pub use crate::io::{ - Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, - TsvRecordIterator, + Bed3Iterator, Bed4Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, + GenomicRangesParser, TsvRecordIterator, + }; + pub use crate::join::{ + CombinedJoinData, CombinedJoinDataBothEmpty, CombinedJoinDataLeftEmpty, + CombinedJoinDataRightEmpty, }; pub use crate::data::DatumType; pub use crate::ranges::{ + coitrees::{COITreesEmpty, COITreesIndexed}, try_range, vec::{VecRangesEmpty, VecRangesIndexed}, }; + + pub use crate::merging_iterators::{ + ConditionalMergingResultIterator, MergingEmptyResultIterator, MergingResultIterator, + }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, GenomicRangeRecordUnwrappable, GenomicRangesTsvSerialize, IndexedDataContainer, diff --git a/src/main/mod.rs b/src/main/mod.rs index 142b90a..373a852 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -3,9 +3,10 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ commands::{ - granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, ProcessingMode, + granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, FilterChroms, + Merge, ProcessingMode, }, - data::operations::Operation, + data::operations::FloatOperation, prelude::GRangesError, Position, PositionOffset, }; @@ -28,6 +29,8 @@ Subcommands: map: Compute the left grouped overlaps between the left genomic ranges and right genomic ranges, and apply one or more operations to the score column of the right BED5 file. + + merge: Merge ranges that are within a minimum distance of each other. windows: Create a set of genomic windows of the specified width (in basepairs), stepping the specified step size (the width, by default). 
@@ -79,6 +82,7 @@ enum Commands { sort: bool, // TODO add skip_missing here }, + FilterChroms(FilterChroms), /// Filter out the left ranges that do not have overlaps with any /// right ranges. This is a "semi-join" in SQL terminology. Filter { @@ -157,8 +161,8 @@ enum Commands { right: PathBuf, /// Operation - #[clap(short, long, value_parser = clap::value_parser!(Operation), use_value_delimiter = true, value_delimiter = ',')] - func: Vec, + #[clap(short, long, value_parser = clap::value_parser!(FloatOperation), use_value_delimiter = true, value_delimiter = ',')] + func: Vec, /// An optional output file (standard output will be used if not specified) #[arg(short, long)] @@ -169,6 +173,7 @@ enum Commands { #[arg(short, long)] skip_missing: bool, }, + Merge(Merge), /// Create a set of genomic windows ranges using the specified width /// and step size, and output to BED3. /// @@ -240,6 +245,7 @@ fn run() -> Result<(), GRangesError> { output, skip_missing, }) => granges_filter(genome, left, right, output.as_ref(), *skip_missing), + Some(Commands::FilterChroms(filter_chroms)) => filter_chroms.run(), Some(Commands::Flank { genome, bedfile, @@ -301,6 +307,8 @@ fn run() -> Result<(), GRangesError> { *skip_missing, ) } + // NOTE: this is the new API, so clean! + Some(Commands::Merge(merge)) => merge.run(), Some(Commands::Windows { genome, width, diff --git a/src/merging_iterators.rs b/src/merging_iterators.rs new file mode 100644 index 0000000..c298785 --- /dev/null +++ b/src/merging_iterators.rs @@ -0,0 +1,475 @@ +//! Merging iterators +//! +// TODO: these probably could use better names? +// TODO: we should support weighting by overlaps. 
+use std::cmp::max; + +use crate::{ + error::GRangesError, + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + traits::GenericRange, + PositionOffset, +}; + +// Create a new [`MergingEmptyResultIterator`], which work over data-less +// "empty" ranges ([`GenomicRangeRecordEmpty`] and merge them based on their +// distance or degree of overlap. +pub struct MergingEmptyResultIterator +where + I: IntoIterator>, +{ + last_range: Option, + inner: ::IntoIter, + minimum_distance: PositionOffset, +} + +impl MergingEmptyResultIterator +where + I: IntoIterator>, +{ + pub fn new(inner: I, minimum_distance: PositionOffset) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + } + } +} + +impl Iterator for MergingEmptyResultIterator +where + I: IntoIterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + for result in self.inner.by_ref() { + let next_range = match result { + Ok(range) => range, + Err(e) => return Some(Err(e)), + }; + + if let Some(last_range) = &mut self.last_range { + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + last_range.end = max(last_range.end, next_range.end); + } else { + let return_range = last_range.clone(); + self.last_range = Some(next_range); + return Some(Ok(return_range)); + } + } else { + self.last_range = Some(next_range); + // If this is the first range, we continue looking for more to potentially merge with. + continue; + } + } + + // If we get here, the inner iterator is exhausted. + // We need to return the last_range if it exists and make sure it's cleared for subsequent calls. + self.last_range.take().map(Ok) + } +} + +/// An iterator over [`Result, GRangesError>`] that +/// merges ranges that are less than some specified `minimum_distance` apart. +/// If `minimum_distance` is negative, it is taken as a minimum overlap width +/// to merge at. 
+pub struct MergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + accumulated_data: Vec, +} + +impl MergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for MergingResultIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + for next_result in self.inner.by_ref() { + match next_result { + Ok(next_range) => { + if let Some(ref mut last_range) = self.last_range { + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. + let final_data = + (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extend range with accumulated data. 
+ return Some(Ok(return_range)); + } + } else { + // There is no last range -- this must be the first. + self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + Err(e) => return Some(Err(e)), + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(Ok(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + })); + } + } + + None // Return None when all items have been processed + } +} + +/// A merging iterator (over [`Result, GRangesError`] items, +/// i.e. from a parsing iterator) that has an additional merge condition tested +/// by function `G` based on the items. +pub struct ConditionalMergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + last_range: Option>, + inner: ::IntoIter, + minimum_distance: PositionOffset, + func: F, + group: G, + accumulated_data: Vec, +} + +impl ConditionalMergingResultIterator +where + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + pub fn new(inner: I, minimum_distance: PositionOffset, func: F, group: G) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + func, + group, + accumulated_data: Vec::new(), + } + } +} + +impl Iterator for ConditionalMergingResultIterator +where + GenomicRangeRecord: GenericRange, + GenomicRangeRecord: GenericRange, + I: IntoIterator, GRangesError>>, + U: Clone, + V: Clone, + F: Fn(Vec) -> V, + G: Fn(&GenomicRangeRecord, &GenomicRangeRecord) -> bool, +{ + 
type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + for next_result in self.inner.by_ref() { + match next_result { + Ok(next_range) => { + if let Some(ref mut last_range) = self.last_range { + // If we have an additional group-by merging condition, + // check that. If not set this is always true. + let satifies_groupby = (self.group)(last_range, &next_range); + + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && satifies_groupby + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + // this range overlaps the last range, so we keep accumulating data + last_range.end = max(last_range.end, next_range.end); + self.accumulated_data.push(next_range.data); + continue; + } else { + // New range does not overlap the last range, so we have to finalize. + // First, run function on accumulated taken data. + let final_data = + (self.func)(std::mem::take(&mut self.accumulated_data)); + let return_range = GenomicRangeRecord { + seqname: last_range.seqname.clone(), + start: last_range.start, + end: last_range.end, + data: final_data, + }; + // Next push this new range's data to the data stack (empty after take) + self.accumulated_data.push(next_range.data.clone()); + self.last_range = Some(next_range); + // Push the new possibly-extended range with accumulated data. + return Some(Ok(return_range)); + } + } else { + // There is no last range -- this must be the first. 
+ self.last_range = Some(GenomicRangeRecord { + seqname: next_range.seqname, + start: next_range.start, + end: next_range.end, + data: next_range.data.clone(), + }); + self.accumulated_data.push(next_range.data); + continue; + } + } + Err(e) => return Some(Err(e)), + } + } + + if let Some(last_range) = self.last_range.take() { + if !self.accumulated_data.is_empty() { + // Finalize any accumulated data + let final_data = (self.func)(std::mem::take(&mut self.accumulated_data)); + return Some(Ok(GenomicRangeRecord { + seqname: last_range.seqname, + start: last_range.start, + end: last_range.end, + data: final_data, + })); + } + } + + None // Return None when all items have been processed + } +} + +#[cfg(test)] +mod tests { + use crate::{ + io::{parsers::Bed5Addition, Bed3Iterator, Bed5Iterator}, + merging_iterators::{ConditionalMergingResultIterator, MergingEmptyResultIterator}, + ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, + }; + + use super::MergingResultIterator; + + #[test] + fn test_merging_iterators() { + // NOTE/TODO: the file-based test is because we don't have a good + // way yet to go from `granges_test_case_01()` into a parsing-like + // iterator. 
+ let iter = Bed5Iterator::new("tests_data/test_case_01.bed").unwrap(); + + let sum_scores = + |data: Vec| data.iter().map(|bed5| bed5.score.unwrap()).sum::(); + + let merged_iter = MergingResultIterator::new(iter, 0, sum_scores); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + // dbg!(&results); + + assert_eq!( + results[0], + GenomicRangeRecord::new("chr1".to_string(), 0, 7, 9.2,) + ); + + assert_eq!( + results[1], + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1,) + ); + + assert_eq!( + results[2], + GenomicRangeRecord::new( + "chr2".to_string(), + 10, + 32, + 4.800000000000001, // simplifies float comparison + ) + ); + } + + #[test] + fn test_merging_empty_iterators() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + let merged_iter = MergingEmptyResultIterator::new(iter, 0); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 7,) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr1".to_string(), 10, 17,) + ); + + assert_eq!( + results[2], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_merging_empty_iterators_overlap_neg2() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + // with -2, we require *at least two* overlapping basepairs. 
+ // chr1 ranges: [0, 5), [4, 7) - these overlap by one, not merged; no others + // chr2 ranges: [10, 20), [18, 32) - these overlap by two, so merged + let merged_iter = MergingEmptyResultIterator::new(iter, -2); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 5,) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr1".to_string(), 4, 7,) + ); + + assert_eq!( + results[2], + GenomicRangeRecordEmpty::new("chr1".to_string(), 10, 17,) + ); + + assert_eq!( + results[3], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_merging_empty_iterators_overlap_pos10() { + let iter = Bed3Iterator::new("tests_data/test_case_03.bed").unwrap(); + + // with 10, we should just have two ranges. + let merged_iter = MergingEmptyResultIterator::new(iter, 10); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + assert_eq!( + results[0], + GenomicRangeRecordEmpty::new("chr1".to_string(), 0, 17) + ); + + assert_eq!( + results[1], + GenomicRangeRecordEmpty::new("chr2".to_string(), 10, 32,) + ); + } + + #[test] + fn test_conditional_merging_iterators() { + let iter = Bed5Iterator::new("tests_data/test_case_03.bed").unwrap(); + + let sum_scores = + |data: Vec| data.iter().map(|bed5| bed5.score.unwrap()).sum::(); + + fn group_features( + range_1: &GenomicRangeRecord, + range_2: &GenomicRangeRecord, + ) -> bool { + range_1.data.name == range_2.data.name + } + + let merged_iter = + ConditionalMergingResultIterator::new(iter, 0, sum_scores, group_features); + + let results: Vec<_> = Result::from_iter(merged_iter).unwrap(); + + // unlike unconditional merging iterators, the first ranges here, + // though book-ended, are *not* overlapped because they have different feature + // names + assert_eq!( + results[0], + GenomicRangeRecord::new("chr1".to_string(), 0, 5, 1.1,) + ); + + assert_eq!( + results[1], + 
GenomicRangeRecord::new("chr1".to_string(), 4, 7, 8.1,) + ); + + assert_eq!( + results[2], + GenomicRangeRecord::new("chr1".to_string(), 10, 17, 10.1,) + ); + + // these *are* merged because they have the same feature name + assert_eq!( + results[3], + GenomicRangeRecord::new( + "chr2".to_string(), + 10, + 32, + 4.800000000000001, // simplifies float comparison + ) + ); + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 4719b46..a4d4c1d 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -185,6 +185,20 @@ impl GenomicRangeRecord { data, } } + + /// Consume this [`GenomicRangeRecord`], apply a function to its data + /// and return the new [`GenomicRangeRecord`] with different data. + pub fn into_map_data(self, func: F) -> GenomicRangeRecord + where + F: Fn(U) -> V, + { + GenomicRangeRecord { + seqname: self.seqname, + start: self.start, + end: self.end, + data: func(self.data), + } + } } impl GenericRange for GenomicRangeRecord { @@ -249,6 +263,7 @@ impl GenericRangeOperations for GenomicRangeRecord { /// Represents a genomic range entry without data, e.g. from a BED3 parser. 
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] pub struct GenomicRangeRecordEmpty { pub seqname: String, pub start: Position, @@ -504,4 +519,31 @@ mod tests { let range_b = RangeEmpty::new(2, 5); assert_eq!(range_a.overlap_width(&range_b), 3); } + + #[test] + fn test_distance_or_overlap() { + // | 0 | 1 | | | + // | | | | | 4 | 5 | + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(4, 6); + assert_eq!(range_a.distance_or_overlap(&range_b), 2); + + // | 0 | 1 | | | + // | | | 2 | 3 | 4 | 5 | + let range_a = RangeEmpty::new(0, 2); + let range_b = RangeEmpty::new(2, 6); + assert_eq!(range_a.distance_or_overlap(&range_b), 0); + + // | 0 | 1 | 2 | | + // | | | 2 | 3 | 4 | | + let range_a = RangeEmpty::new(1, 3); + let range_b = RangeEmpty::new(2, 5); + assert_eq!(range_a.distance_or_overlap(&range_b), -1); + + // | | 1 | 2 | 3 | 4 | ... + // | | | 2 | 3 | 4 | | + let range_a = RangeEmpty::new(1, 10); + let range_b = RangeEmpty::new(2, 5); + assert_eq!(range_a.distance_or_overlap(&range_b), -3); + } } diff --git a/src/reporting.rs b/src/reporting.rs index 3fd7353..1c09e5b 100644 --- a/src/reporting.rs +++ b/src/reporting.rs @@ -12,11 +12,11 @@ #[allow(unused)] pub struct CommandOutput { value: U, - report: Report, + report: Option, } impl CommandOutput { - pub fn new(value: U, report: Report) -> Self { + pub fn new(value: U, report: Option) -> Self { Self { value, report } } } diff --git a/src/sequences/nucleotide.rs b/src/sequences/nucleotide.rs index b476492..7bcee34 100644 --- a/src/sequences/nucleotide.rs +++ b/src/sequences/nucleotide.rs @@ -459,7 +459,6 @@ mod tests { .region_map(&get_len, "chr2", 0, *chr2_len) .unwrap(); assert_eq!(total_len, *chr2_len as usize); - dbg!(&reference); assert!(reference.is_loaded("chr2")); } diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 9a66f20..e8a7760 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,14 +1,14 @@ //! 
Test cases and test utility functions. //! -use std::path::PathBuf; +use std::{io::BufRead, path::PathBuf}; use crate::{ commands::granges_random_bed, create_granges_with_seqlens, error::GRangesError, granges::GRangesEmpty, - io::parsers::bed::Bed5Addition, + io::{parsers::bed::Bed5Addition, InputStream}, prelude::{GRanges, VecRangesIndexed}, ranges::{ coitrees::COITrees, @@ -199,6 +199,18 @@ pub fn random_bed5file(length: usize) -> NamedTempFile { temp_bedfile } +// "head" a file -- useful for peeking at temp files in failed tests. +pub fn head_file(filepath: impl Into) { + let filepath = filepath.into(); + let file = InputStream::new(filepath); + let mut reader = file.reader().unwrap(); + for _ in 0..10 { + let mut line = String::new(); + reader.read_line(&mut line).unwrap(); + dbg!(&line); + } +} + /// Range test case #1 /// /// This also has a corresponding reference sequence /// /// Ranges: /// - chr1: -/// (0, 5, Some(1.1)) -/// (4, 7, Some(8.1)) -/// (10, 17, Some(10.1)) +/// (0, 5, 1.1) +/// (4, 7, 8.1) +/// (10, 17, 10.1) /// - chr2: -/// (10, 20, Some(3.7)) -/// (18, 32, Some(1.1)) +/// (10, 20, 3.7) +/// (18, 32, 1.1) /// /// Seqlens: { "chr1" => 30, "chr2" => 100 } /// @@ -241,6 +253,27 @@ pub fn granges_test_case_02() -> GRanges> { }, seqlens: { "chr1" => 50, "chr2" => 300 }) } +/// Range test case #3 (modified from #1) +/// +/// This is a test case with Bed5 data. This is to test +/// merging iterators, with grouping by feature name. +/// +/// This matches `tests_data/test_case_03.bed`.
+/// +/// Seqlens: { "chr1" => 30, "chr2" => 100 } +/// +/// Sum of all elements: 24.1 +/// +pub fn granges_test_case_03() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(0, 5, Bed5Addition { name: "a".to_string(), score: Some(1.1) }), + (4, 7, Bed5Addition { name: "b".to_string(), score: Some(8.1) }), + (10, 17, Bed5Addition { name: "c".to_string(), score: Some(10.1) })], + "chr2" => [(10, 20, Bed5Addition { name: "d".to_string(), score: Some(3.7)}), + (18, 32, Bed5Addition { name: "d".to_string(), score: Some(1.1) })] + }, seqlens: { "chr1" => 30, "chr2" => 100 }) +} + #[cfg(feature = "ndarray")] /// Mock numeric `NumericSequences1` data pub fn random_array1_sequences(n: usize) -> GenomeMap> { @@ -281,3 +314,33 @@ pub fn random_array2_sequences(n: usize) -> GenomeMap> { } seqs } + +#[cfg(test)] +mod tests { + use crate::{ + granges::GRanges, io::Bed5Iterator, seqlens, test_utilities::granges_test_case_01, + }; + + #[test] + fn test_test_case_01() { + let genome = seqlens!("chr1" => 30, "chr2" => 100); + // yea we're testing test cases. yo dawg. + let iter = Bed5Iterator::new("tests_data/test_case_01.bed").unwrap(); + + // extract out the score, passing the errors through + // NOTE: this is because we don't have a try_fold iterator + // method setup yet -- we need that, and when we get it, + // we should adjust this + let iter_simplified = iter.map(|result| { + match result { + // this unwrap is for '.'
+ Ok(record) => Ok(record.into_map_data(|bed5_cols| bed5_cols.score.unwrap())), + Err(e) => Err(e), + } + }); + + // build a new GRanges + let gr = GRanges::from_iter(iter_simplified, &genome).unwrap(); + assert_eq!(gr, granges_test_case_01()); + } +} diff --git a/src/traits.rs b/src/traits.rs index 8e87db3..a2d062b 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -16,7 +16,7 @@ use crate::{ join::LeftGroupedJoin, prelude::VecRangesIndexed, ranges::GenomicRangeRecord, - Position, + Position, PositionOffset, }; /// Traits for [`GRanges`] types that can be modified. @@ -79,6 +79,27 @@ pub trait GenericRange: Clone { overlap_end.saturating_sub(overlap_start) } + fn distance_or_overlap(&self, other: &R) -> PositionOffset { + if self.end() > other.start() && self.start() < other.end() { + // The ranges overlap -- return how much as a negative number + // TODO/OPTIMIZE: is this slow? + let overlap: PositionOffset = self.overlap_width(other).try_into().unwrap(); + -overlap + } else if self.end() == other.start() || self.start() == other.end() { + // The ranges are bookends (adjacent to each other) + 0 + } else { + // The ranges do not overlap and are not bookends; calculate the distance + // If `self` is entirely before `other`, the distance is `other.start() - self.end()`. + // If `self` is entirely after `other`, the distance is `self.start() - other.end()`. + // Using `max` to ensure we always get the positive distance regardless of range order + std::cmp::max( + other.start().saturating_sub(self.end()), + self.start().saturating_sub(other.end()), + ) as PositionOffset + } + } + /// Return a tuple of the range created by an overlap with another range; `None` if no overlap. 
fn overlap_range(&self, other: &R) -> Option<(Position, Position)> { let overlap_start = std::cmp::max(self.start(), other.start()); @@ -97,6 +118,51 @@ pub trait GenericRange: Clone { } } +/// The [`GenericGenomicRange`] extends sequence name comparison and related +/// functionality to [`GenericRange`]. +// TODO: to do this right and have it be useful we need to push +// the seqname indices lower into parsing, e.g. so this works for key types +// like GenericGenomicRange. +pub trait GenericGenomicRange: GenericRange { + /// Returns the sequence name index associated with the range + fn seqname_index(&self) -> usize; + + /// Calculate how many basepairs overlap this range and another, considering seqname_index + fn genomic_overlap_width(&self, other: &R) -> Position { + if self.seqname_index() != other.seqname_index() { + return 0; // No overlap if seqname_index is different + } + + let overlap_start = std::cmp::max(self.start(), other.start()); + let overlap_end = std::cmp::min(self.end(), other.end()); + + if overlap_start >= overlap_end { + 0 + } else { + overlap_end.saturating_sub(overlap_start) + } + } + + /// Return a tuple of the range created by an overlap with another range, considering seqname_index + fn genomic_overlap_range( + &self, + other: &R, + ) -> Option<(Position, Position)> { + if self.seqname_index() != other.seqname_index() { + return None; // No overlap if seqname_index is different + } + + let overlap_start = std::cmp::max(self.start(), other.start()); + let overlap_end = std::cmp::min(self.end(), other.end()); + + if overlap_start <= overlap_end { + Some((overlap_start, overlap_end)) + } else { + None + } + } +} + /// The [`JoinDataOperations`] trait unifies common operations /// over combined join data types ([`CombinedJoinData`], /// [`CombinedJoinDataBothEmpty`], etc). 
diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 49ebf7c..9095184 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -2,20 +2,94 @@ use granges::{ commands::granges_random_bed, - io::{parsers::bed::bed_missing, InputStream}, + io::parsers::bed::bed_missing, prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator}, ranges::GenomicRangeRecord, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, + Position, }; +use indexmap::IndexMap; use std::{ fs::File, - io::BufRead, path::PathBuf, process::{Command, Stdio}, }; use serde::Deserialize; +/// A macro to ensure that standard out is the same. In some cases +/// this cannot be used, e.g. bedtools and granges have different +/// output float precision, so validation in that case is more involved. +/// Importantly, this also checks that standard out has a length > 0, +/// which can be a sneaky edge case where output "matches" but is not +/// valid since the comparison will pass spuriously. +macro_rules! 
assert_stdout_eq { + ($left:expr, $right:expr) => { + let left_output = String::from_utf8_lossy(&$left.stdout); + let right_output = String::from_utf8_lossy(&$right.stdout); + + // Check that lengths are greater than 0 + assert!( + !left_output.is_empty() && !right_output.is_empty(), + "One or both outputs are empty" + ); + + // Check that the outputs are equal + assert_eq!(left_output, right_output, "Outputs are not equal"); + }; +} + +fn validate_bedfloats( + bedtools_path: impl Into, + granges_path: impl Into, + genome: &IndexMap, + tol: f64, + context: Option, +) { + let bedtools_path = bedtools_path.into(); + let granges_path = granges_path.into(); + let bedtools_iter = BedlikeIterator::new(bedtools_path).unwrap(); + let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); + + let granges_iter = BedlikeIterator::new(granges_path).unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); + + let granges_data = granges_gr.take_data().unwrap(); + let granges_data = granges_data.iter().map(|extra_cols| { + // explicitly handle missing, then unwrap (rather than ok(), which is less safe) + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: Result = extra_cols.as_ref().unwrap().parse(); + // dbg!(&extra_cols); + Some(score.unwrap()) + }); + + let bedtools_data = bedtools_gr.take_data().unwrap(); + let bedtools_data = bedtools_data.iter().map(|extra_cols| { + if *extra_cols == Some(".".to_string()) { + return None; + } + let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); + Some(score) + }); + assert_eq!(granges_data.len(), bedtools_data.len()); + + granges_data + .zip(bedtools_data) + .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { + (Some(gr), Some(bd)) => assert!((gr - bd).abs() < tol), + // NOTE: for some sum operations with no data, + // bedtools returns '.' not 0.0. The latter is more correct + // (the sum of the empty set is not NA, it's 0.0). 
+ // This is a shim so tests don't stochastically break + // in this case. + (Some(n), None) if n == 0.0 => (), + (None, None) => (), + _ => panic!("{:?}", (&context, &gr_val, &bd_val)), + }); +} + // adjustable test sizes -- useful also for making very small // so string diffs are easier to compare. #[cfg(not(feature = "bench-big"))] @@ -23,18 +97,7 @@ const BED_LENGTH: usize = 10; #[cfg(feature = "bench-big")] const BED_LENGTH: usize = 1_000_000; -// helpers - -fn head_file(filepath: impl Into) { - let filepath = filepath.into(); - let file = InputStream::new(&filepath); - let mut reader = file.reader().unwrap(); - for _ in 0..10 { - let mut line = String::new(); - reader.read_line(&mut line).unwrap(); - dbg!(&line); - } -} +// -- some validation helpers #[macro_export] macro_rules! assert_float_tol { @@ -73,6 +136,8 @@ macro_rules! assert_option_float_tol { }}; } +// -- main validation tests below -- + #[test] fn test_random_bed3file_filetype_detect() { let random_bedfile_path = temp_bedfile().path().to_path_buf(); @@ -86,7 +151,7 @@ fn test_random_bed3file_filetype_detect() { ) .expect("could not generate random BED file"); - head_file(&random_bedfile_path); + // head_file(&random_bedfile_path); match GenomicRangesFile::detect(random_bedfile_path).unwrap() { GenomicRangesFile::Bed3(_) => (), _ => panic!(": could not detect correct filetype"), @@ -134,10 +199,7 @@ fn test_against_bedtools_slop() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - assert_eq!( - String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools intersect -a -b -wa -u @@ -190,10 +252,7 @@ fn test_against_bedtools_intersect_wa() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - assert_eq!( - 
String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } /// Test bedtools flank -g -i -l 10 -r 20 @@ -245,6 +304,8 @@ fn test_against_bedtools_flank() { let bedtools_str = String::from_utf8_lossy(&bedtools_output.stdout); let granges_str = String::from_utf8_lossy(&granges_output.stdout); + assert!(bedtools_str.len() > 0); + assert!(granges_str.len() > 0); let mut bedtools_ranges: Vec<_> = bedtools_str.split("\n").collect(); let mut granges_ranges: Vec<_> = granges_str.split("\n").collect(); @@ -287,11 +348,7 @@ fn test_against_bedtools_makewindows() { assert!(bedtools_output.status.success(), "{:?}", bedtools_output); assert!(granges_output.status.success(), "{:?}", granges_output); - - assert_eq!( - String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + assert_stdout_eq!(bedtools_output, granges_output); } } } @@ -369,47 +426,13 @@ fn test_against_bedtools_map() { assert!(granges_output.status.success(), "{:?}", granges_output); let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); - - let bedtools_iter = BedlikeIterator::new(bedtools_path.path()).unwrap(); - let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); - - let granges_iter = BedlikeIterator::new(granges_output_file.path().to_path_buf()).unwrap(); - let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); - - let granges_data = granges_gr.take_data().unwrap(); - let granges_data = granges_data.iter().map(|extra_cols| { - // explicitly handle missing, then unwrap (rather than ok(), which is less safe) - if *extra_cols == Some(".".to_string()) { - return None; - } - let score: Result = extra_cols.as_ref().unwrap().parse(); - // dbg!(&extra_cols); - Some(score.unwrap()) - }); - - let bedtools_data = bedtools_gr.take_data().unwrap(); - let bedtools_data = bedtools_data.iter().map(|extra_cols| { - if 
*extra_cols == Some(".".to_string()) { - return None; - } - let score: f64 = extra_cols.as_ref().unwrap().parse().unwrap(); - Some(score) - }); - assert_eq!(granges_data.len(), bedtools_data.len()); - - granges_data - .zip(bedtools_data) - .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { - (Some(gr), Some(bd)) => assert!((gr - bd).abs() < 1e-5), - // NOTE: for some sum operations with no data, - // bedtools returns '.' not 0.0. The latter is more correct - // (the sum of the empty set is not NA, it's 0.0). - // This is a shim so tests don't stochastically break - // in this case. - (Some(n), None) if n == 0.0 => (), - (None, None) => (), - _ => panic!("{:?}", (&operation, &gr_val, &bd_val)), - }); + validate_bedfloats( + bedtools_path.path(), + granges_output_file.path().to_path_buf(), + &genome, + 1e-6, + format!("operation: {}", operation).into(), + ); } } @@ -497,7 +520,7 @@ fn test_against_bedtools_map_multiple() { median: Option, } - head_file(&bedtools_path.path()); + // head_file(&bedtools_path.path()); let bedtools_iter = TsvRecordIterator::>::new(bedtools_path.path()).expect("HERE"); let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); @@ -506,6 +529,7 @@ fn test_against_bedtools_map_multiple() { granges_output_file.path().to_path_buf(), ) .unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); let granges_data = granges_gr.take_data().unwrap(); @@ -527,3 +551,92 @@ fn test_against_bedtools_map_multiple() { assert_option_float_tol!(gr.mean, bd.mean); }); } + +#[test] +fn test_against_bedtools_merge_empty() { + let num_ranges = BED_LENGTH; + let random_bedfile_path = random_bed3file(num_ranges); + let distances = vec![0, 1, 10, 20]; + + for distance in distances { + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") + .arg(&random_bedfile_path.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("bedtools merge failed"); + + let granges_output = 
Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(&random_bedfile_path.path()) + .arg("-d") + .arg(distance.to_string()) + .output() + .expect("granges merge failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + assert_stdout_eq!(bedtools_output, granges_output); + } +} + +#[test] +fn test_against_bedtools_merge_map() { + let num_ranges = BED_LENGTH; + let bedscores_file = random_bed5file(num_ranges); + let distances = vec![0, 1, 10, 20]; + + let operations = vec!["sum", "min", "max", "mean", "median"]; + + for operation in operations { + for distance in &distances { + let bedtools_path = temp_bedfile(); + let bedtools_output_file = File::create(&bedtools_path).unwrap(); + + let bedtools_output = Command::new("bedtools") + .arg("merge") + .arg("-i") + .arg(&bedscores_file.path()) + .arg("-d") + .arg(distance.to_string()) + .arg("-c") + .arg("5") // the score column + .arg("-o") + .arg(operation) + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools merge failed"); + + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("merge") + .arg("--bedfile") + .arg(&bedscores_file.path()) + .arg("-d") + .arg(distance.to_string()) + .arg("--func") + .arg(operation) + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges merge failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + // head_file(granges_output_file.path()); + assert!(granges_output.status.success(), "{:?}", granges_output); + // head_file(bedtools_path.path()); + + let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); + validate_bedfloats( + bedtools_path.path(), + granges_output_file.path().to_path_buf(), + &genome, + 1e-6, + format!("operation: {}, distance: {}", operation, distance).into(), + ); + } + } +} diff --git 
a/tests_data/example_bedlike.tsv b/tests_data/example_bedlike.tsv index 28891f9..fd3a664 100644 --- a/tests_data/example_bedlike.tsv +++ b/tests_data/example_bedlike.tsv @@ -1,5 +1,5 @@ -chr1 10 20 1.23 -chr1 14 18 3.10 -chr2 4 5 4.11 -chr2 5 6 0.13 -chr4 9 15 0.34 +chr1 10 20 1.23 1.2 10.1 +chr1 14 18 3.10 3.1 0.1 +chr2 4 5 4.11 2.1 1.1 +chr2 5 6 0.13 0.8 8.1 +chr4 9 15 0.35 0.01 0.0 diff --git a/tests_data/test_case_01.bed b/tests_data/test_case_01.bed new file mode 100644 index 0000000..48e1016 --- /dev/null +++ b/tests_data/test_case_01.bed @@ -0,0 +1,5 @@ +chr1 0 5 a 1.1 +chr1 4 7 b 8.1 +chr1 10 17 c 10.1 +chr2 10 20 d 3.7 +chr2 18 32 e 1.1 diff --git a/tests_data/test_case_03.bed b/tests_data/test_case_03.bed new file mode 100644 index 0000000..d7350b5 --- /dev/null +++ b/tests_data/test_case_03.bed @@ -0,0 +1,5 @@ +chr1 0 5 a 1.1 +chr1 4 7 b 8.1 +chr1 10 17 c 10.1 +chr2 10 20 d 3.7 +chr2 18 32 d 1.1