From 548fbd0e46703603ef9232463e91d075d7983398 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Wed, 14 Feb 2024 21:16:12 -0800 Subject: [PATCH] Basic iterator trait, range pushing methods, to coitrees, and more! - tests and test utilies - rough join stuff - interval/range conversion methods - new GRanges methods refined - iteration methods - read_seqlens - PathBuf-based arguments in io - lazy BED parsing - invalid BED tests - adjust range operation with tests - clippy & fmt - GitHub Rust workflow added --- .github/workflows/rust.yml | 22 +++ Cargo.toml | 28 ++- src/data/mod.rs | 3 + src/data/ndarray.rs | 62 ++++++ src/data/vec.rs | 33 ++++ src/error.rs | 24 ++- src/granges.rs | 177 +++++++++++++---- src/io/io.rs | 191 ++++++++++++++++++ src/io/mod.rs | 9 + src/io/noodles.rs | 12 ++ src/io/parsers.rs | 344 +++++++++++++++++++++++++++++++++ src/iterators.rs | 78 ++++++++ src/join.rs | 43 +++++ src/lib.rs | 52 ++++- src/main/commands.rs | 22 +++ src/main/mod.rs | 71 +++++++ src/ranges/coitrees.rs | 154 ++++++++++++--- src/ranges/mod.rs | 126 +++++++----- src/ranges/operations.rs | 54 ++++++ src/ranges/vec.rs | 35 +++- src/test_utilities.rs | 39 +++- src/traits.rs | 77 ++++++++ tests_data/example.bed | 5 + tests_data/invalid.bed | 5 + tests_data/noodles_example.bed | 2 + 25 files changed, 1529 insertions(+), 139 deletions(-) create mode 100644 .github/workflows/rust.yml create mode 100644 src/data/mod.rs create mode 100644 src/data/ndarray.rs create mode 100644 src/data/vec.rs create mode 100644 src/io/io.rs create mode 100644 src/io/mod.rs create mode 100644 src/io/noodles.rs create mode 100644 src/io/parsers.rs create mode 100644 src/iterators.rs create mode 100644 src/main/commands.rs create mode 100644 src/main/mod.rs create mode 100644 src/ranges/operations.rs create mode 100644 tests_data/example.bed create mode 100644 tests_data/invalid.bed create mode 100644 tests_data/noodles_example.bed diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..31000a2 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/Cargo.toml b/Cargo.toml index d69a2d1..0e635ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,33 @@ [package] -name = "granges2" +name = "granges" version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +license = "MIT" +authors = ["Vince Buffalo "] +keywords = ["genomics", "bioinformatics", "compbio"] +categories = ["science"] +documentation = "https://docs.rs/granges/" +repository = "https://github.com/vsbuffalo/granges" +description = "A Rust library and command line tool for genomic range operations." 
[dependencies] +# clap = { version = "4.4.18", features = ["derive"], optional = true } +clap = { version = "4.4.18", features = ["derive"] } coitrees = { version = "0.4.0", features = ["nosimd"] } -genomap = "0.1.5" +flate2 = "1.0.28" +genomap = "0.2.6" indexmap = "2.2.3" +ndarray = "0.15.6" +noodles = { version = "0.63.0", features = ["core", "bed"] } rand = "0.8.5" thiserror = "1.0.57" + +# [features] +# cli = [ "clap" ] + +[[bin]] +name = "granges" +path = "src/main/mod.rs" +# required-features = ["cli"] + + diff --git a/src/data/mod.rs b/src/data/mod.rs new file mode 100644 index 0000000..a2966e8 --- /dev/null +++ b/src/data/mod.rs @@ -0,0 +1,3 @@ +//! Data container implementations. + +pub mod vec; diff --git a/src/data/ndarray.rs b/src/data/ndarray.rs new file mode 100644 index 0000000..2885f34 --- /dev/null +++ b/src/data/ndarray.rs @@ -0,0 +1,62 @@ +//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`]. + +use ndarray::{Array1, Array2, ArrayView1}; +use crate::traits::IndexedDataContainer; + +impl<'a, U> IndexedDataContainer<'a> for Array1 +where + U: Copy + Default + 'a, +{ + type Item = U; + type Output = Array1; + + fn get_value(&'a self, index: usize) -> Self::Item { + self[index] + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx))) + } +} + +impl<'a, U> IndexedDataContainer<'a> for Array2 +where + U: Copy + Default + 'a, +{ + type Item = ArrayView1<'a, U>; + type Output = Array2; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.row(index) + } + + fn len(&self) -> usize { + self.shape()[0] + } + + fn is_valid_index(&self, index: usize) -> bool { + index < self.shape()[0] + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + let cols = self.shape()[1]; + + let rows_data: Vec = indices + .iter() + .flat_map(|&idx| self.row(idx).iter().cloned().collect::>()) + .collect(); + + // create a new Array2 from the rows + // shape is (number of indices, number of columns) + Array2::from_shape_vec((indices.len(), cols), rows_data) + .expect("Shape and collected data size mismatch") + } +} diff --git a/src/data/vec.rs b/src/data/vec.rs new file mode 100644 index 0000000..a794fe4 --- /dev/null +++ b/src/data/vec.rs @@ -0,0 +1,33 @@ +//! Data container implementations for [`Vec`]. + + +/// Trait methods for the commonly-used `Vec` data container. +/// +/// Note that the associated `Item` type is always a *reference* to the data elements. 
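+///
+/// # Example
+/// A short sketch of this container's trait methods (marked `ignore`, so it is
+/// not run as a doc test):
+/// ```ignore
+/// use granges::traits::IndexedDataContainer;
+///
+/// let data = vec![0.1, 0.2, 0.3, 0.4];
+/// assert_eq!(*data.get_value(2), 0.3);
+/// // build a new data container from a subset of range indices
+/// assert_eq!(data.new_from_indices(&[0, 3]), vec![0.1, 0.4]);
+/// ```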
+impl<'a, U> IndexedDataContainer<'a> for Vec +where + U: Clone + 'a, +{ + type Item = &'a U; + type Output = Vec; + + fn get_value(&'a self, index: usize) -> Self::Item { + self.get(index).unwrap() + } + + fn len(&self) -> usize { + self.len() + } + + fn is_valid_index(&self, index: usize) -> bool { + self.get(index).is_some() + } + + fn new_from_indices(&self, indices: &[usize]) -> Self::Output { + Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone())) + } +} + + + + diff --git a/src/error.rs b/src/error.rs index 9f636f1..7feabc3 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,13 +1,33 @@ +use std::num::{ParseIntError, ParseFloatError}; + +use genomap::GenomeMapError; use thiserror::Error; use crate::Position; #[derive(Debug, Error)] pub enum GRangesError { + // IO related errors + #[error("File reading eror: {0}")] + IOError(#[from] std::io::Error), + + // File parsing related errors + #[error("Integer parsing error: {0}")] + ParseIntError(#[from] ParseIntError), + #[error("Float parsing error: {0}")] + ParseFloatError(#[from] ParseFloatError), + #[error("Bed-like file has too few columns. The first three columns must be sequence name, and start and end positions.\nLine: {0}")] + BedlikeTooFewColumns(String), + #[error("File has invalid column type entry: {0}")] + InvalidColumnType(String), + + // Invalid genomic range errors #[error("Range invalid: start ({0}) must be greater than end ({1})")] InvalidGenomicRange(Position, Position), - #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] InvalidGenomicRangeForSequence(Position, Position, Position), - + #[error("Sequence name '{0}' is not the ranges container")] + MissingSequence(String), + #[error("Error encountered in genomap::GenomeMap")] + GenomeMapError(#[from] GenomeMapError), } diff --git a/src/granges.rs b/src/granges.rs index 13c9365..e6b679a 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1,17 +1,27 @@ use genomap::GenomeMap; - -use crate::{traits::RangeContainer, ranges::{vec::{VecRanges, VecRangesIndexed, VecRangesEmpty}, RangeIndexed, RangeEmpty}, Position}; - - +use indexmap::IndexMap; + +use crate::{ + prelude::GRangesError, + ranges::{ + coitrees::{COITrees, COITreesIndexed}, + vec::{VecRanges, VecRangesEmpty, VecRangesIndexed}, + RangeEmpty, RangeIndexed, RangeRecord, + }, + traits::{RangeContainer, RangesIterable}, + Position, iterators::GRangesIterator, +}; + +#[derive(Clone, Debug)] pub struct GRanges { - ranges: GenomeMap, - data: Option, + pub(crate) ranges: GenomeMap, + pub(crate) data: Option, } - impl GRanges -where C: RangeContainer { - +where + C: RangeContainer, +{ /// Get the total number of ranges. pub fn len(&self) -> usize { self.ranges.values().map(|ranges| ranges.len()).sum() @@ -21,28 +31,50 @@ where C: RangeContainer { pub fn is_empty(&self) -> bool { self.len() == 0 } -} -impl GRanges> { + /// Get the raw range container. + pub fn get_ranges(&self, seqname: &str) -> Option<&C> { + self.ranges.get(seqname) + } + /// Get the sequence names. + pub fn seqnames(&self) -> Vec { + self.ranges.names() + } +} + +impl GRanges, T> { /// Create a new [`GRanges`] object, with vector storage for ranges and data. /// /// This combination of range and data containers is used when loading data into - /// a new [`GRanges`] object, and the size cannot be known beforehand. Rust's - /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] + /// a new [`GRanges`] object, and the size cannot be known beforehand. 
Rust's + /// [`Vec`] will dynamically grow to accommodate new ranges; use [`GRanges.shrink()`] /// call the [`Vec`]'s shrink to size methods on the range and data containers /// after data loading to shrink to the minimal necessary size (this can reduce /// memory usage). - pub fn new_vec() -> Self { - let ranges = GenomeMap::new(); - Self { - ranges, - data: None, + pub fn new_vec(seqlens: &IndexMap) -> Self { + let mut ranges = GenomeMap::new(); + for (seqname, length) in seqlens.iter() { + // this should never happen because the error is only if + // insert encounters a seqname that's already been inserted -- that + // cannot happen here. + ranges + .insert(seqname, VecRanges::new(*length)) + .expect("Internal error: please report"); } + Self { ranges, data: None } } +} - - pub fn push_range_with_data(&mut self, seqname: &str, start: Position, end: Position, data: U) { +impl GRanges> { + /// Push a genomic range with its data to the range and data containers in a [`GRanges] object. + pub fn push_range_with_data( + &mut self, + seqname: &str, + start: Position, + end: Position, + data: U, + ) -> Result<(), GRangesError> { // push data to the vec data container, getting the index let index: usize = { let data_container = self.data.get_or_insert_with(Vec::new); @@ -51,39 +83,108 @@ impl GRanges> { }; // push an indexed range let range = RangeIndexed::new(start, end, index); - self.ranges.entry_or_default(seqname).ranges.push(range); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) } } impl GRanges { + /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. + pub fn push_range_empty( + &mut self, + seqname: &str, + start: Position, + end: Position, + ) -> Result<(), GRangesError> { + // push an unindexed (empty) range + let range = RangeEmpty::new(start, end); + let range_container = self + .ranges + .get_mut(seqname) + .ok_or(GRangesError::MissingSequence(seqname.to_string()))?; + range_container.push_range(range); + Ok(()) + } +} - /// Create a new [`GRanges`] object, with vector storage for ranges and no data container. - pub fn new_vec_empty() -> Self { - let ranges = GenomeMap::new(); - Self { - ranges, - data: None, +impl GRanges> { + pub fn from_iter( + iter: I, + seqlens: IndexMap, + ) -> Result>, GRangesError> + where + I: Iterator, GRangesError>>, + { + let mut gr = GRanges::new_vec(&seqlens); + for possible_entry in iter { + let entry = possible_entry?; + gr.push_range_with_data(&entry.seqname, entry.start, entry.end, entry.data)?; } + Ok(gr) } +} - /// Push an empty range (no data) to the [`VecRangesEmpty`] range container. - pub fn push_range_only(&mut self, seqname: &str, start: Position, end: Position) { - // push an unindexed (empty) range - let range = RangeEmpty::new(start, end); - self.ranges.entry_or_default(seqname).ranges.push(range); +impl GRanges { + pub fn from_iter_empty( + iter: I, + seqlens: IndexMap, + ) -> Result, GRangesError> + where + I: Iterator, GRangesError>>, + { + let mut gr = GRanges::new_vec(&seqlens); + for possible_entry in iter { + let entry = possible_entry?; + gr.push_range_empty(&entry.seqname, entry.start, entry.end)?; + } + Ok(gr) } } +impl GRanges { + /// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree + /// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library + /// by Daniel C. Jones. 
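+    ///
+    /// # Example
+    /// A small sketch (marked `ignore`, so it is not run as a doc test), mirroring
+    /// the `test_to_coitrees` unit test below:
+    /// ```ignore
+    /// use granges::test_utilities::granges_test_case_01;
+    ///
+    /// let gr_vec = granges_test_case_01();        // VecRanges-backed GRanges
+    /// let gr = gr_vec.to_coitrees().unwrap();     // now backed by interval trees
+    /// assert_eq!(gr.len(), 5);
+    /// ```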
+ pub fn to_coitrees(self) -> Result, GRangesError> { + let old_ranges = self.ranges; + let mut new_ranges = GenomeMap::new(); + for (seqname, vec_ranges) in old_ranges.into_iter() { + let trees = COITrees::from(vec_ranges); + new_ranges.insert(&seqname, trees)?; + } + Ok(GRanges { + ranges: new_ranges, + data: self.data, + }) + } +} +impl GRanges +where R: RangesIterable { + /// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object. + pub fn iter_ranges<'a>(&'a self) -> GRangesIterator<'a, R> { + GRangesIterator::new(&self.ranges) + } +} #[cfg(test)] mod tests { - use crate::{prelude::*, test_utilities::random_vecranges}; + use indexmap::indexmap; + + use crate::{ + prelude::*, + test_utilities::{granges_test_case_01, random_vecranges}, + }; #[test] fn test_new_vec() { - let mut gr = GRanges::new_vec(); - gr.push_range_with_data("chr1", 0, 10, 1.1); + let seqlens = indexmap! { "chr1".to_string() => 10}; + let mut gr = GRanges::new_vec(&seqlens); + gr.push_range_with_data("chr1", 0, 10, 1.1).unwrap(); assert_eq!(gr.len(), 1); } @@ -93,4 +194,10 @@ mod tests { assert_eq!(vr.len(), 100) } + #[test] + fn test_to_coitrees() { + let gr_vec = granges_test_case_01(); + let gr = gr_vec.clone().to_coitrees().unwrap(); + assert_eq!(gr.len(), 5); + } } diff --git a/src/io/io.rs b/src/io/io.rs new file mode 100644 index 0000000..f0d62a2 --- /dev/null +++ b/src/io/io.rs @@ -0,0 +1,191 @@ +//! Input/Output file handling with [`InputFile`] and [`OutputFile`]. +//! +//! These types abstract over reading/writing both plaintext and gzip-compressed +//! input/output. + +use flate2::read::GzDecoder; +use flate2::write::GzEncoder; +use flate2::Compression; +use indexmap::IndexMap; +use std::fs::File; +use std::io::Write; +use std::io::{self, BufWriter}; +use std::io::{BufRead, BufReader, Read}; +use std::path::PathBuf; + +use crate::Position; +use crate::error::GRangesError; + +/// Read a tab-delimited *genome file* of sequence (i.e. chromosome) names and their lengths. +pub fn read_seqlens(filepath: impl Into) -> Result, GRangesError> { + let input_file = InputFile::new(filepath); + let reader = input_file.reader()?; + + let mut seqlens = IndexMap::new(); + for result in reader.lines() { + let line = result?; + let mut columns = line.split('\t'); + let seqname = columns.next().unwrap(); + let length: Position = columns.next().unwrap().parse()?; + seqlens.insert(seqname.to_string(), length); + } + Ok(seqlens) +} + +/// Check if a file is a gzipped by looking for the magic numbers +fn is_gzipped_file(file_path: impl Into) -> io::Result { + let mut file = File::open(file_path.into())?; + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + + Ok(buffer == [0x1f, 0x8b]) +} + +/// Represents an input file. +/// +/// This struct is used to handle operations on an input file, such as reading from the file. +/// This abstracts how data is read in, allowing for both plaintext and gzip-compressed input +/// to be read through a common interface. +pub struct InputFile { + pub filepath: PathBuf, + pub comments: Option>, + pub header: Option, + pub skip_lines: usize, +} + +impl InputFile { + /// Constructs a new `InputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `InputFile` will automatically uncompress the input. 
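+    ///
+    /// # Example
+    /// A rough sketch (marked `ignore`, so it is not run as a doc test); the
+    /// path below is a hypothetical input file:
+    /// ```ignore
+    /// use std::io::BufRead;
+    /// use granges::io::InputFile;
+    ///
+    /// let input_file = InputFile::new("ranges.bed.gz");
+    /// for line in input_file.reader()?.lines() {
+    ///     println!("{}", line?);
+    /// }
+    /// ```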
+ pub fn new(filepath: impl Into) -> Self { + Self { + filepath: filepath.into(), + comments: None, + header: None, + skip_lines: 0, + } + } + + /// Opens the file and returns a buffered reader. + /// + /// If the file is gzip-compressed (indicated by a ".gz" extension), this method will + /// automatically handle the decompression. + /// + /// # Returns + /// + /// A result containing a `BufReader>` on success, or a `FileError` on failure. + /// + pub fn reader(&self) -> io::Result>> { + let file = File::open(self.filepath.clone())?; + //let is_gzipped_name = self.filepath.ends_with(".gz"); + let is_gzipped = is_gzipped_file(&self.filepath)?; + let reader: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + Ok(BufReader::new(reader)) + } + + /// Collects comment lines and/or a line at the start of the file. + pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result { + let mut buf_reader = self.reader()?; + let mut comments = Vec::new(); + let mut line = String::new(); + + while buf_reader.read_line(&mut line)? > 0 { + if line.starts_with(comment) { + comments.push(line.trim_end().to_string()); + self.skip_lines += 1; + } else if let Some(header_string) = header { + if line.starts_with(header_string) { + self.header = Some(line.trim_end().to_string()); + self.skip_lines += 1; + // We only handle one header line. If there are more, the + // file is *very* poorly formatted. So just let downstream + // parsing errors catch this. In the future, we could have a specialized + // error. + break; + } + // break on the first non-header/comment line + break; + } + line.clear(); + } + + self.comments = Some(comments); + Ok(self.skip_lines > 0) + } + + /// Method to continue reading after skipping the comment and header lines. + pub fn continue_reading(&self) -> io::Result>> { + let mut buf_reader = self.reader()?; + let mut skipped_lines = 0; + let mut line = String::new(); + + // skip the lines that were previously read as comments or header + while skipped_lines < self.skip_lines { + buf_reader.read_line(&mut line)?; + skipped_lines += 1; + line.clear(); + } + Ok(buf_reader) + } +} + +/// Represents an output file. +/// +/// This struct is used to handle operations on an output file, such as writing to the file. +/// This abstracts writing both plaintext and gzip-compressed files. +pub struct OutputFile { + pub filepath: PathBuf, + pub header: Option>, +} + +impl OutputFile { + /// Constructs a new `OutputFile`. + /// + /// # Arguments + /// + /// * `filepath` - A string slice that holds the path to the file. If the file extension is + /// `.gz`, `OutputFile` will automatically write gzip-compressed output. + /// * `header` - An optional vector of strings representing commented header lines to be written to the file. + pub fn new(filepath: impl Into, header: Option>) -> Self { + Self { + filepath: filepath.into(), + header, + } + } + + /// Opens the file and returns a writer. + /// + /// If the file path ends with ".gz", the file is treated as gzip-compressed, and the + /// function will handle compression automatically. If a header is set, it will be written + /// to the file. + /// + /// # Returns + /// + /// A result containing a `Box` on success, or an `io::Error` on failure. 
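+    ///
+    /// # Example
+    /// A rough sketch (marked `ignore`, so it is not run as a doc test); the
+    /// output path below is a hypothetical example:
+    /// ```ignore
+    /// use std::io::Write;
+    /// use granges::io::OutputFile;
+    ///
+    /// let output = OutputFile::new("ranges.bed.gz", Some(vec!["created by granges".to_string()]));
+    /// let mut writer = output.writer()?;
+    /// // the header entry above is written first, prefixed with '#'
+    /// writeln!(writer, "chr1\t0\t10")?;
+    /// ```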
+ pub fn writer(&self) -> io::Result> { + let outfile = &self.filepath; + let is_gzip = outfile.ends_with(".gz"); + let mut writer: Box = if is_gzip { + Box::new(BufWriter::new(GzEncoder::new( + File::create(outfile)?, + Compression::default(), + ))) + } else { + Box::new(BufWriter::new(File::create(outfile)?)) + }; + // write header if one is set + if let Some(entries) = &self.header { + for entry in entries { + writeln!(writer, "#{}", entry)?; + } + } + Ok(writer) + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs new file mode 100644 index 0000000..10df12c --- /dev/null +++ b/src/io/mod.rs @@ -0,0 +1,9 @@ +//! Input/Output +//! + +pub mod io; +pub mod noodles; +pub mod parsers; + +pub use io::{InputFile, OutputFile}; +pub use parsers::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; diff --git a/src/io/noodles.rs b/src/io/noodles.rs new file mode 100644 index 0000000..242e308 --- /dev/null +++ b/src/io/noodles.rs @@ -0,0 +1,12 @@ +//! + +use crate::Position; +use noodles::core::Position as NoodlesPosition; + +/// Convert from [`noodles::core::Position`], which is [1-based +/// indexing](https://docs.rs/noodles-core/latest/noodles_core/position/struct.Position.html) +/// to GRanges internal 0-based indexing. +pub fn convert_noodles_position(value: NoodlesPosition) -> Position { + let one_based_position: Position = value.get().try_into().unwrap(); + one_based_position - 1 +} diff --git a/src/io/parsers.rs b/src/io/parsers.rs new file mode 100644 index 0000000..574f0af --- /dev/null +++ b/src/io/parsers.rs @@ -0,0 +1,344 @@ +//! Functionality for general parsing, by turning BED-like files into iterators. +//! + +use noodles::bed::{self}; +use std::collections::HashSet; +use std::io::{BufRead, BufReader, Read}; +use std::path::PathBuf; + +use crate::Position; +use crate::error::GRangesError; +use crate::io::io::InputFile; +use crate::ranges::RangeRecord; +use crate::traits::GeneralRangeRecordIterator; + +use super::noodles::convert_noodles_position; + +pub struct Bed3RecordIterator { + bed_reader: bed::Reader>>, +} + +impl Bed3RecordIterator { + pub fn new(filepath: impl Into) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + let bed_reader = bed::Reader::new(reader); + Ok(Self { bed_reader }) + } + + pub fn from_reader(reader: BufReader>) -> Self { + let bed_reader = bed::Reader::new(reader); + Self { bed_reader } + } +} + +impl Iterator for Bed3RecordIterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.bed_reader.records::<3>().next().map(|res| { + res.map_err(GRangesError::IOError) + .map(|record| RangeRecord { + seqname: record.reference_sequence_name().to_string(), + start: convert_noodles_position(record.start_position()), + end: convert_noodles_position(record.end_position()), + data: (), + }) + }) + } +} + +impl GeneralRangeRecordIterator<()> for Bed3RecordIterator { + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals { + FilteredIntervals::new(self, Some(seqnames), None) + } + + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals { + FilteredIntervals::new(self, None, Some(seqnames)) + } +} + +pub struct TsvRecordIterator { + reader: BufReader>, + parser: F, + phantom: std::marker::PhantomData, +} + +impl TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + pub fn new(filepath: impl Into, parser: F) -> Result { + let input_file = InputFile::new(filepath); + let reader = input_file.continue_reading()?; + + Ok(Self { + reader, + parser, + phantom: 
std::marker::PhantomData, + }) + } +} + +impl Iterator for TsvRecordIterator +where + F: Fn(&str) -> Result, GRangesError>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => None, + Ok(_) => { + let line = line.trim_end(); + Some((self.parser)(line)) + } + Err(e) => Some(Err(GRangesError::IOError(e))), + } + } +} + +/// A BED-like file parser. This works by lazy-parsing the first three +/// columns, which are standard to all BED files. +pub struct BedlikeIterator { + iter: TsvRecordIterator Result, GRangesError>, String>, +} + +impl BedlikeIterator { + pub fn new(filepath: impl Into) -> Result { + // Wrap the parse_bedlike_to_range_record function to conform with TsvRecordIterator's expectations. + let parser: fn(&str) -> Result, GRangesError> = parse_bed_lazy; + + let iter = TsvRecordIterator::new(filepath, parser)?; + Ok(Self { iter }) + } +} + +impl Iterator for BedlikeIterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + + + +/// An iterator over [`IntervalRecord`] items that filters based on sequence name. +/// +/// Note that that the exclude filter is prioritized over the retain filter. So, +/// if a pipeline contains both, then if a chromosome is supplied to +/// [`GeneralIntervalIterator.exclude_seqnames()`], then even if it is in retain, +/// it is skipped. This is to prevent validation and returning a [`Result`]. +/// +/// # Example +/// +/// ``` +/// use granges::prelude::*; +/// use noodles::bed; +/// +/// let iter = Bed3RecordIterator::new("tests_data/example.bed") +/// .expect("error reading file") +/// .exclude_seqnames(vec!["chr1".to_string()]); +/// +/// let seqlens = seqlens! { "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; +/// let gr = GRanges::from_iter(iter, seqlens) +/// .expect("parsing error"); +/// let mut iter = gr.iter_ranges(); +/// +/// // the first range should be the third range in the file, +/// // chr2:4-5 +/// assert_eq!(iter.next().unwrap().start, 4); +/// assert_eq!(iter.next().unwrap().end, 5); +/// ``` +pub struct FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, +} + +impl FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + pub fn new( + inner: I, + retain_seqnames: Option>, + exclude_seqnames: Option>, + ) -> Self { + let retain_seqnames = retain_seqnames.map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); + Self { + inner, + retain_seqnames, + exclude_seqnames, + } + } +} + +impl Iterator for FilteredIntervals +where + I: Iterator, GRangesError>>, +{ + type Item = Result, GRangesError>; + + /// Get the next filtered entry, prioritizing exclude over retain. + fn next(&mut self) -> Option { + for item in self.inner.by_ref() { + match &item { + Ok(entry) => { + if self + .exclude_seqnames + .as_ref() + .map_or(false, |ex| ex.contains(&entry.seqname)) + { + continue; + } + if self + .retain_seqnames + .as_ref() + .map_or(true, |rt| rt.contains(&entry.seqname)) + { + return Some(item); + } + } + Err(_) => return Some(item), + } + } + None + } +} + +/// Parses a single column from a string slice into a specified type. +/// +/// This function is particularly useful for converting columns in genomic data files +/// from string representation to a specific type like `usize`, `f64`, etc. 
+/// +/// # Type Parameters +/// +/// * `T`: The type to which the column should be parsed. It must implement `FromStr`. +/// +/// # Arguments +/// +/// * `column`: The string slice representing the column to be parsed. +/// * `line`: The entire line from which the column was extracted. Used for error reporting. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the column cannot be parsed into type `T`. +pub fn parse_column(column: &str, line: &str) -> Result +where + ::Err: std::fmt::Debug, +{ + // NOTE: this is used a lot, and should be benchmarked. + column + .parse::() + .map_err(|_| GRangesError::InvalidColumnType(format!("column '{}' in '{}'", column, line))) +} + +/// Parses a BED-like format line into its constituent components. +/// +/// BED-like formats contain BED3 columns (chromosome, start, and end) and +/// others. It's an non-standard but extensible (since it's just a TSV), and sensible +/// for a variety of genomic data. +/// +/// # Arguments +/// +/// * `line`: A string slice representing a line in BED3 format. +/// +/// # Errors +/// +/// Returns `GRangesError::InvalidColumnType` if the line does not have enough columns +/// or if any column cannot be correctly parsed. +pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str>), GRangesError> { + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + // Collect remaining columns + let additional_columns = columns[3..].to_vec(); + + Ok((seqname, start, end, additional_columns)) +} + +/// Lazily parses a BED* format line into the first three columns defining the range, +/// storing the rest as a `String`. +pub fn parse_bed_lazy(line: &str) -> Result, GRangesError> { + let columns: Vec<&str> = line.splitn(4, '\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + let data = columns[3].to_string(); + + Ok(RangeRecord { + seqname, + start, + end, + data, + }) +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + + use super::BedlikeIterator; + + #[test] + fn test_parser() { + // based on this example: https://docs.rs/noodles-bed/latest/noodles_bed/struct.Reader.html#method.records + let iter = Bed3RecordIterator::new("tests_data/noodles_example.bed").unwrap(); + + let seqlens = seqlens! { "sq0" => 10 }; + let gr: GRanges = GRanges::from_iter_empty(iter, seqlens).unwrap(); + + assert_eq!(gr.len(), 2); + + // let mut gr_iter = gr.iter_ranges(); + // let first_interval = gr_iter.next().unwrap(); + // assert_eq!(first_interval.first, 7); + // assert_eq!(first_interval.last, 12); + // + // let second_interval = gr_iter.next().unwrap(); + // assert_eq!(second_interval.first, 20); + // assert_eq!(second_interval.last, 33); + } + + #[test] + #[should_panic] + fn test_invalid_bed_noodles() { + let iter = Bed3RecordIterator::new("tests_data/invalid.bed").unwrap(); + + let seqlens = seqlens! 
{ "sq0" => 10 }; + let _gr: GRanges = GRanges::from_iter_empty(iter, seqlens).unwrap(); + } + + #[test] + fn test_invalid_bedlike_iterator() { + let iter = BedlikeIterator::new("tests_data/invalid.bed").unwrap(); + let seqlens = seqlens! { "sq0" => 10 }; + let result: Result, _> = GRanges::from_iter(iter, seqlens); + dbg!(&result); + // note: the Rust LSP thinks this isn't used for some reason, so prefaced with _ + // to silence warnings. + let _msg = "column '-1' in 'chr1\t-1\t20'".to_string(); + assert!(matches!( + result, + Err(GRangesError::InvalidColumnType(_msg)) + )); + } +} diff --git a/src/iterators.rs b/src/iterators.rs new file mode 100644 index 0000000..d032d78 --- /dev/null +++ b/src/iterators.rs @@ -0,0 +1,78 @@ +use genomap::GenomeMap; + +use crate::{ + ranges::{RangeIndexed, RangeIndexedRecord}, + traits::{RangeContainer, RangesIterable}, +}; + +/// An iterator over [`RangeIndexedRecord`], which store +/// indices to the sequence names and data container. +/// +/// # Developer Notes +/// Using indices, rather than references to data items and `&str` directly, +/// prevents lifetime complexity. +pub struct GRangesIterator<'a, R> { + ranges: &'a GenomeMap, + current_seqname_index: usize, + current_range_iter: Box + 'a>, +} + +impl<'a, R> GRangesIterator<'a, R> +where + R: RangesIterable, +{ + pub fn new(ranges: &'a GenomeMap) -> Self { + let current_range_iter = ranges.get_by_index(0).unwrap().iter_ranges(); + Self { + ranges, + current_seqname_index: 0, + current_range_iter, + } + } +} + +impl<'a, R> Iterator for GRangesIterator<'a, R> +where + R: RangeContainer + RangesIterable, +{ + type Item = RangeIndexedRecord; + + fn next(&mut self) -> Option { + loop { + if let Some(next_range) = self.current_range_iter.next() { + return Some(RangeIndexedRecord { + seqname_index: self.current_seqname_index, + start: next_range.start, + end: next_range.end, + index: next_range.index, + }); + } else { + // try to load another sequence's set of ranges. + self.current_seqname_index += 1; + if self.current_seqname_index >= self.ranges.len() { + // we're out of range container iterators + return None; + } + self.current_range_iter = self + .ranges + .get_by_index(self.current_seqname_index) + .unwrap() + .iter_ranges(); + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ranges::RangeIndexedRecord, test_utilities::granges_test_case_01}; + + use super::GRangesIterator; + + #[test] + fn test_genomic_ranges_iterator() { + let gr = granges_test_case_01(); + let mut iter = GRangesIterator::new(&gr.ranges); + assert_eq!(iter.next().unwrap(), RangeIndexedRecord::new(0, 0, 5, 0)); + } +} diff --git a/src/join.rs b/src/join.rs index 139597f..a381a51 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,2 +1,45 @@ +use std::rc::Rc; +use crate::Position; +pub struct JoinData { + /// The data index for the left range. + left: usize, + + /// A `Vec` of the indices for the overlapping right ranges. + rights: Vec, + + /// The length of the left range. + left_length: Position, + + /// The lengths of the right ranges. + right_lengths: Vec, + + /// The lengths of the overlaps between the left and right ranges. + overlaps: Vec, + + // TODO: we may want some simple summary of whether something is + // up or downstream. I think the cleanest summary is a signed integer + // representing the side and degree of non-overlap. E.g. a range + // that overlaps another but overhangs the 3' side of the focal left + // range by 10bp is +10; if it were 5', it would be -10. 
+ /// A possible reference to the left data of type `T`. + left_data: Option>, + + /// A possible reference to the right data of type `T`. + right_data: Option>, +} + +pub struct JoinIterator { + left_data: Option>, + right_data: Option>, +} +// +// impl JoinIterator { +// +// } +// +// +// pub fn left_join(left: GRanges, right: GRanges) -> JoinIterator { +// +// } diff --git a/src/lib.rs b/src/lib.rs index cb1be7a..5620e6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,57 @@ +// Copyright (2024) Vince Buffalo +#![crate_name = "granges"] +#![doc(html_root_url = "https://docs.rs/granges/")] + +pub use indexmap; -pub mod traits; -pub mod ranges; -pub mod granges; pub mod error; +pub mod granges; +pub mod io; +pub mod iterators; +pub mod join; +pub mod ranges; pub mod test_utilities; +pub mod traits; pub type Position = u32; +pub type PositionOffset = i32; // signed variant pub mod prelude { - pub use crate::granges::GRanges; pub use crate::error::GRangesError; + pub use crate::granges::GRanges; + pub use crate::io::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator}; + + pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; + pub use crate::traits::{GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable}; + + pub use crate::seqlens; +} + +/// Create and [`indexmap::IndexMap`] of sequence names and their lengths. +#[macro_export] +macro_rules! seqlens { + ($($key:expr => $value:expr),* $(,)?) => { + $crate::indexmap::indexmap!($($key.to_string() => $value),*) + }; +} + +/// Create a new `GRanges` with sequence length information (used primarily for small examples) +#[macro_export] +macro_rules! create_granges_with_seqlens { + ($range_type:ty, $data_type:ty, { $($chr:expr => [$(($start:expr, $end:expr, $data:expr)),*]),* }, seqlens: { $($chr_len:expr => $len:expr),* }) => { + { + let mut seqlens = ::indexmap::IndexMap::new(); + $(seqlens.insert($chr_len.to_string(), $len);)* + + let mut gr: GRanges<$range_type, $data_type> = GRanges::new_vec(&seqlens); + + $( + $( + gr.push_range_with_data(&$chr.to_string(), $start, $end, $data).expect("Failed to push range"); + )* + )* + + gr + } + }; } diff --git a/src/main/commands.rs b/src/main/commands.rs new file mode 100644 index 0000000..a71e774 --- /dev/null +++ b/src/main/commands.rs @@ -0,0 +1,22 @@ +use std::path::PathBuf; + +use granges::io::io::read_seqlens; +use granges::prelude::*; + + + +/// Adjust the genomic ranges in a bedfile by some specified amount. +// NOTE: we don't do build the full GRanges objects here, for efficiency. +// But it would be a good benchmark to see how much slower that is. +pub fn granges_adjust(bedfile: &PathBuf, seqlens: &PathBuf, both: &i32, output: Option<&PathBuf>) -> Result<(), GRangesError> { + let genome = read_seqlens(seqlens)?; + + let bedlike_iterator = BedlikeIterator::new(bedfile)?; + + for record in bedlike_iterator { + let range = record?; + let seqname = range.seqname; + let length = *genome.get(&seqname).ok_or(GRangesError::MissingSequence(seqname))?; + } + Ok(()) +} diff --git a/src/main/mod.rs b/src/main/mod.rs new file mode 100644 index 0000000..c944c02 --- /dev/null +++ b/src/main/mod.rs @@ -0,0 +1,71 @@ +use std::path::PathBuf; + +use clap::{Parser, Subcommand}; +use granges::prelude::GRangesError; + +pub mod commands; +use crate::commands::granges_adjust; + +const INFO: &str = "\ +granges: genomic range operations built off of the GRanges library +usage: granges [--help] + +Subcommands: + + adjust: adjust each genomic range, e.g. to add a kilobase to each end. 
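+
+Example (with placeholder file names):
+
+  granges adjust --seqlens <genome_file> --both 1000 <bedfile>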
+ +"; + +#[derive(Parser)] +#[clap(name = "granges")] +#[clap(about = INFO)] +struct Cli { + #[arg(short, long, action = clap::ArgAction::Count)] + debug: u8, + + #[command(subcommand)] + command: Option, +} + +#[derive(Subcommand)] +enum Commands { + Adjust { + /// a TSV genome file of chromosome names and their lengths + #[arg(long, required = true)] + seqlens: PathBuf, + /// an input BED-like TSV file + #[arg(required = true)] + bedfile: PathBuf, + /// number of basepairs to expand the range start and end positions by + #[arg(long)] + both: i32, + /// an optional output file (standard output will be used if not specified) + output: Option, + }, +} + +fn run() -> Result<(), GRangesError> { + let cli = Cli::parse(); + match &cli.command { + Some(Commands::Adjust { + bedfile, + seqlens, + both, + output, + }) => granges_adjust(bedfile, seqlens, both, output.as_ref()), + None => { + println!("{}\n", INFO); + std::process::exit(1); + } + } +} + +fn main() { + match run() { + Ok(_) => {} + Err(e) => { + eprintln!("Error: {}", e); + std::process::exit(1); + } + } +} diff --git a/src/ranges/coitrees.rs b/src/ranges/coitrees.rs index 2e9d041..78aa8ad 100644 --- a/src/ranges/coitrees.rs +++ b/src/ranges/coitrees.rs @@ -1,49 +1,75 @@ -use coitrees::{Interval, BasicCOITree, IntervalTree, IntervalNode, GenericInterval}; +use coitrees::{BasicCOITree, GenericInterval, Interval, IntervalNode, IntervalTree}; -use crate::{Position, traits::RangeContainer, error::GRangesError}; +use crate::{error::GRangesError, traits::{RangeContainer}, traits::RangesIterable, Position}; -use super::{vec::VecRanges, RangeIndexed, validate_range}; +use super::{validate_range, vec::VecRanges, RangeEmpty, RangeIndexed}; -type COITreeIntervalIndexed = Interval; +pub type COITreesIndexed = COITrees; + +impl GenericInterval<()> for RangeEmpty { + fn first(&self) -> i32 { + self.start.try_into().unwrap() + } + fn last(&self) -> i32 { + self.end.try_into().unwrap() + } + fn metadata(&self) -> &() { + &() + } +} impl GenericInterval for RangeIndexed { fn first(&self) -> i32 { - self.start().try_into().unwrap() + self.start.try_into().unwrap() } fn last(&self) -> i32 { - self.end().try_into().unwrap() + self.end.try_into().unwrap() } fn metadata(&self) -> &usize { - self.index() + &self.index } } /// A [`coitrees::BasicCOITree`] interval tree for a single sequence's ranges. /// -/// This is generic over the interval type, to handle the case where one -/// may want to do overlap operations on ranges without associated data in +/// This is generic over the interval type, to handle the case where one +/// may want to do overlap operations on ranges without associated data in /// a data container (e.g. ranges that just define megabase windwows). -pub struct COITreeRangeContainer { - ranges: BasicCOITree, +/// +pub struct COITrees { + pub(crate) ranges: BasicCOITree, /// The sequence length, used to validate new ranges. 
- length: Position, + pub length: Position, } -impl COITreeRangeContainer { - pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) +impl std::fmt::Debug for COITrees { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("COITrees") + .field("number of ranges:", &self.ranges.len()) + .field("length", &self.length) + .finish() } +} - pub fn query(&self, start: Position, end: Position, visit: F) - where F: FnMut(&IntervalNode) { - // Note the terminology change to match coitrees (and uses i32s) - let first = start.try_into().expect("could not covert"); - let end: i32 = end.try_into().expect("could not covert"); - // internally coitrees uses 0-indexed, right-inclusive "last" - self.ranges.query(first, end - 1, visit) +impl COITrees { + /// Validate a range, raising an error if it is invalid for some reason. + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + validate_range(start, end, self.length) } + /// Query this range container for a particular range, and call a visit function on all + /// overlapping ranges. + pub fn query(&self, start: Position, end: Position, visit: F) + where + F: FnMut(&IntervalNode), + { + // Note the terminology change to match coitrees (and uses i32s) + let first = start.try_into().expect("could not covert"); + let end: i32 = end.try_into().expect("could not covert"); + // internally coitrees uses 0-indexed, right-inclusive "last" + self.ranges.query(first, end - 1, visit) + } + /// Return the number of ranges in this [`COITreeRangeContainer`] container. pub fn len(&self) -> usize { self.ranges.len() @@ -53,22 +79,92 @@ impl COITreeRangeContainer { pub fn is_empty(&self) -> bool { self.len() == 0 } - } -impl> From> for COITreeRangeContainer { +/// Convert a [`VecRanges`] range container to a [`COITrees`] range container. +impl, M: Clone> From> for COITrees { fn from(value: VecRanges) -> Self { let ranges = BasicCOITree::new(&value.ranges); let length = value.length; - Self { - ranges, - length + Self { ranges, length } + } +} + +/// Convert a [`COITrees`] range container to a [`VecRanges`] range container. +impl From> for VecRanges { + fn from(value: COITrees) -> Self { + let length = value.length; + let mut ranges: VecRanges = VecRanges::new(length); + for interval in value.ranges.iter() { + ranges.push_range(interval.into()); } + ranges } } -impl RangeContainer for COITreeRangeContainer { +/// [`RangeContainer`] trait implementations. +impl RangeContainer for COITrees { fn len(&self) -> usize { self.ranges.len() } } + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeEmpty`]. +impl From> for RangeEmpty { + fn from(value: Interval<&()>) -> Self { + RangeEmpty { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + } + } +} + +/// Convert between [`coitrees::Interval`] with index metadata to a [`RangeIndexed`]. +impl From> for RangeIndexed { + fn from(value: Interval<&usize>) -> Self { + RangeIndexed { + start: value.first.try_into().unwrap(), + end: value.last.try_into().unwrap(), + index: *value.metadata(), + } + } +} + +/// # Developer Notes +/// +/// Internally, the [`coitrees`] iterator is over their interval type. +/// Their iterator does not consume, but we need an owned (or copyable, +/// as is case here) type to map out. Thus, we need this odd variant of +/// an iterator that doesn't return references and does not consume. 
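+///
+/// A short sketch of iterating over one sequence's ranges (marked `ignore`, so
+/// it is not run as a doc test; it mirrors the unit test below):
+/// ```ignore
+/// use granges::prelude::*;
+/// use granges::ranges::RangeIndexed;
+/// use granges::test_utilities::granges_test_case_01;
+///
+/// let gr = granges_test_case_01().to_coitrees().unwrap();
+/// let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges();
+/// assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0));
+/// ```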
+impl RangesIterable for COITrees { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeIndexed::from(interval)); + Box::new(converted_iter) + } +} + +impl RangesIterable for COITrees<()> { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + let converted_iter = iter.map(|interval| RangeEmpty::from(interval)); + Box::new(converted_iter) + } +} + +#[cfg(test)] +mod tests { + use crate::prelude::*; + use crate::ranges::RangeIndexed; + use crate::test_utilities::granges_test_case_01; + + #[test] + fn test_ranges_iterable_coitrees() { + let gr = granges_test_case_01().to_coitrees().unwrap(); + let mut chr1_iter = gr.get_ranges("chr1").unwrap().iter_ranges(); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(0, 5, 0)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(4, 7, 1)); + assert_eq!(chr1_iter.next().unwrap(), RangeIndexed::new(10, 17, 2)); + assert!(chr1_iter.next().is_none()); + } +} diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 1e37bcf..d04742b 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -1,64 +1,74 @@ -use std::ops::Range; +//! Range and Range Containers. +//! +//! -use crate::{Position, error::GRangesError}; +use crate::{error::GRangesError, Position}; pub mod coitrees; pub mod vec; +pub mod operations; - -#[derive(Clone, Default)] +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeEmpty { - range: Range, + pub start: Position, + pub end: Position, } +unsafe impl Sync for RangeEmpty {} +unsafe impl Send for RangeEmpty {} + impl RangeEmpty { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - } - } - - pub fn start(&self) -> Position { - self.range.start - } - - pub fn end(&self) -> Position { - self.range.end + Self { start, end } } } -#[derive(Clone, Debug, Default)] +/// [`RangeIndexed`] is a range with a valid +/// index to a data element in the data container. +#[derive(Clone, Debug, Default, PartialEq)] pub struct RangeIndexed { - range: Range, - index: usize, + pub start: Position, + pub end: Position, + pub index: usize, } +unsafe impl Sync for RangeIndexed {} +unsafe impl Send for RangeIndexed {} + impl RangeIndexed { /// Create a new 0-indexed right-exclusive range. pub fn new(start: Position, end: Position, index: usize) -> Self { - let start = start.try_into().unwrap(); - let end = end.try_into().unwrap(); - Self { - range: start..end, - index - } + Self { start, end, index } } +} - pub fn start(&self) -> Position { - self.range.start - } +/// Represents a parsed range entry, possibly containing some data. +#[derive(Debug, Clone, PartialEq)] +pub struct RangeRecord { + pub seqname: String, + pub start: Position, + pub end: Position, + pub data: U, +} - pub fn end(&self) -> Position { - self.range.end - } +/// Represents a range entry, possible with indices to sequence name and data. +#[derive(Debug, Clone, PartialEq)] +pub struct RangeIndexedRecord { + pub seqname_index: usize, + pub start: Position, + pub end: Position, + pub index: usize, +} - // Note: this returning a reference is required to - // implement coitrees's GenericInterval trait. 
- pub fn index(&self) -> &usize { - &self.index +impl RangeIndexedRecord { + pub fn new(seqname_index: usize, start: Position, end: Position, index: usize) -> Self { + Self { + seqname_index, + start, + end, + index, + } } } @@ -72,33 +82,49 @@ impl RangeIndexed { /// # Returns /// /// * `bool` - `true` if the range is valid for the sequence; otherwise, `false`. -pub fn validate_range(range: &std::ops::Range, length: Position) -> -Result<(), GRangesError> { - let start = range.start; - let end = range.start; - dbg!(&start); - dbg!(&end); +pub fn validate_range( + start: Position, + end: Position, + length: Position, +) -> Result<(), GRangesError> { if start > end { - GRangesError::InvalidGenomicRange(start, end); + return Err(GRangesError::InvalidGenomicRange(start, end)); } if end >= length { - GRangesError::InvalidGenomicRangeForSequence(start, end, length); + return Err(GRangesError::InvalidGenomicRangeForSequence( + start, end, length, + )); } Ok(()) } #[cfg(test)] mod tests { - use crate::prelude::*; use super::validate_range; + use crate::prelude::*; #[test] fn test_invalid_range_start_end() { - let range = 10..1; - let result = validate_range(&range, 10); - dbg!(&range); - assert!(matches!(result, Err(GRangesError::InvalidGenomicRange(10, 0)))); + let result = validate_range(5, 1, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRange(5, 1)) + )); + } + + #[test] + fn test_valid_range_length() { + let result = validate_range(1, 10, 11); + assert!(result.is_ok()); + } + #[test] + fn test_invalid_range_length() { + let result = validate_range(1, 10, 10); + assert!(matches!( + result, + Err(GRangesError::InvalidGenomicRangeForSequence(1, 10, 10)) + )); } } diff --git a/src/ranges/operations.rs b/src/ranges/operations.rs new file mode 100644 index 0000000..1c7feb7 --- /dev/null +++ b/src/ranges/operations.rs @@ -0,0 +1,54 @@ +//! Range Operations. +//! +//! - [`adjust()`]: Adjust range start and end positions. + +use crate::{PositionOffset, Position}; + +use super::RangeIndexed; + +/// Adjusts the coordinates of a range, ensuring the adjusted range is within [0, length] +/// and returning `None` if the range has zero width after adjustment. 
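+///
+/// # Example
+/// A small sketch (marked `ignore`, so it is not run as a doc test), mirroring
+/// the unit tests below:
+/// ```ignore
+/// use granges::ranges::{operations::adjust, RangeIndexed};
+///
+/// let range = RangeIndexed::new(5, 10, 1);
+/// // grow the range by 2 bp on the left and 3 bp on the right,
+/// // clamped to a sequence of length 15
+/// let adjusted = adjust(range, -2, 3, 15).unwrap();
+/// assert_eq!(adjusted, RangeIndexed::new(3, 13, 1));
+/// ```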
+pub fn adjust(range: RangeIndexed, start_delta: PositionOffset, end_delta: PositionOffset, length: Position) -> Option { + let start: PositionOffset = range.start.try_into().unwrap(); + let end: PositionOffset = range.end.try_into().unwrap(); + let length: PositionOffset = length.try_into().unwrap(); + + // ensure within [0, length] + let new_start = (start + start_delta).max(0).min(length); + // ensure new_end >= new_start and within [0, length] + let new_end = (end + end_delta).max(new_start).min(length); + + // check for zero-width range + if new_end <= new_start { + // return None if the range has zero width + None + } else { + Some(RangeIndexed::new(new_start.try_into().unwrap(), new_end.try_into().unwrap(), range.index)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normal_adjustment() { + let range = RangeIndexed::new(5, 10, 1); + let adjusted = adjust(range, -2, 3, 15).unwrap(); + assert_eq!(adjusted, RangeIndexed::new(3, 13, 1)); + } + + #[test] + fn test_out_of_bounds_adjustment() { + let range = RangeIndexed::new(10, 12, 2); + let adjusted = adjust(range, -5, 20, 15).unwrap(); + assert_eq!(adjusted, RangeIndexed::new(5, 15, 2)); + } + + #[test] + fn test_zero_width_result() { + let range = RangeIndexed::new(5, 10, 3); + assert!(adjust(range, 5, -5, 15).is_none()); + } +} + diff --git a/src/ranges/vec.rs b/src/ranges/vec.rs index efc8026..0f0dea2 100644 --- a/src/ranges/vec.rs +++ b/src/ranges/vec.rs @@ -1,21 +1,17 @@ -use crate::{traits::RangeContainer, Position, error::GRangesError}; +use super::{validate_range, RangeEmpty, RangeIndexed}; +use crate::traits::{RangesIntoIterable, RangesIterable}; +use crate::{error::GRangesError, traits::RangeContainer, Position}; -use super::{RangeIndexed, validate_range, RangeEmpty}; pub type VecRangesIndexed = VecRanges; pub type VecRangesEmpty = VecRanges; -#[derive(Clone, Default)] +#[derive(Clone, Debug)] pub struct VecRanges { - pub (crate) ranges: Vec, + pub(crate) ranges: Vec, pub length: Position, } impl VecRanges { - pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { - let range = start..end; - validate_range(&range, self.length) - } - /// Create a new empty [`VecRanges`] container. pub fn new(length: Position) -> Self { Self { @@ -24,6 +20,11 @@ impl VecRanges { } } + /// Validate a range, raising an error if it is invalid for some reason. + pub fn validate_range(&self, start: Position, end: Position) -> Result<(), GRangesError> { + validate_range(start, end, self.length) + } + /// Add a new range to the [`VecRanges`] container. pub fn push_range(&mut self, range: R) { self.ranges.push(range) @@ -45,3 +46,19 @@ impl RangeContainer for VecRanges { self.ranges.len() } } + +impl RangesIntoIterable for VecRanges { + fn into_iter_ranges(self) -> Box> { + let iter = self.ranges.into_iter(); + Box::new(iter) + } +} + +impl RangesIterable for VecRanges { + fn iter_ranges(&self) -> Box + '_> { + let iter = self.ranges.iter(); + // NOTE: RangeIndexed is copyable but there still is overhead here + let converted_iter = iter.map(|interval| interval.to_owned()); + Box::new(converted_iter) + } +} diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 856e5c2..4b330c0 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -1,9 +1,13 @@ //! Test cases and test utility functions. //! 
-use indexmap::IndexMap; +use crate::{ + create_granges_with_seqlens, + prelude::{GRanges, VecRangesIndexed}, + ranges::{coitrees::COITrees, vec::VecRanges, RangeEmpty}, + Position, +}; use rand::{thread_rng, Rng}; -use crate::{Position, ranges::{RangeIndexed, vec::VecRanges, RangeEmpty}}; // Stochastic test ranges defaults // @@ -34,8 +38,7 @@ pub fn random_range(chrom_len: Position) -> (Position, Position) { /// Build random sequence lengths pub fn random_seqlen() -> Position { let mut rng = thread_rng(); - let rand_len = rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN); - rand_len + rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN) } /// Sample a random chromosome @@ -56,6 +59,30 @@ pub fn random_vecranges(n: usize) -> VecRanges { vr } +/// Build random [`COITrees`] from a random [`VecRanges`]. +pub fn random_coitrees() -> COITrees<()> { + let vr = random_vecranges(100); + let cr: COITrees<()> = vr.into(); + cr +} - - +/// Range test case #1 +/// +/// Ranges: +/// - chr1: +/// (0, 5, Some(1.1)) +/// (4, 7, Some(8.1)) +/// (10, 17, Some(10.1)) +/// - chr2: +/// (10, 20, Some(3.7)) +/// (18, 32, Some(1.1)) +/// +/// Seqlens: { "chr1" => 30, "chr2" => 100 } +/// +/// Sum of all elements: 24.1 +pub fn granges_test_case_01() -> GRanges> { + create_granges_with_seqlens!(VecRangesIndexed, Vec, { + "chr1" => [(0, 5, 1.1), (4, 7, 8.1), (10, 17, 10.1)], + "chr2" => [(10, 20, 3.7), (18, 32, 1.1)] + }, seqlens: { "chr1" => 30, "chr2" => 100 }) +} diff --git a/src/traits.rs b/src/traits.rs index f473fb0..b7999ee 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,7 +1,84 @@ +//! Traits used by the GRanges library. +//! +use crate::{error::GRangesError, io::parsers::FilteredIntervals, ranges::RangeRecord}; + + +/// Defines functionality common to all range containers, e.g. [`VecRanges`] and +/// [`COITrees`]. pub trait RangeContainer { fn len(&self) -> usize; fn is_empty(&self) -> bool { self.len() == 0 } } + +/// Defines functionality for filtering [`RangeRecord`] entries based on their +/// sequence names. [`RangeRecord`] are predominantly used in reading in data, +/// so these trait methods simplify excluding or retaining ranges based on what +/// sequence (i.e. chromosome) they are on. +pub trait GeneralRangeRecordIterator: + Iterator, GRangesError>> + Sized +{ + fn retain_seqnames(self, seqnames: Vec) -> FilteredIntervals; + fn exclude_seqnames(self, seqnames: Vec) -> FilteredIntervals; +} + +/// The [`RangesIterable`] trait defines common functionality for iterating over +/// the range types in range containers. +pub trait RangesIterable { + fn iter_ranges(&self) -> Box + '_>; +} + +/// The [`RangesIntoIterable`] trait defines common functionality for *consuming* iterating +/// over the range types in range containers. +pub trait RangesIntoIterable { + fn into_iter_ranges(self) -> Box>; +} + +/// The main [`IndexedDataContainer`] type is used to interact +/// with arbitrary *indexed* container of data of type `DataContainer::Item`. +/// +/// GRanges provides [`IndexedDataContainer`] trait implementations for: +/// +/// - [`ndarray::Array1`] +/// - [`ndarray::Array2`] +/// - [`Vec`] +/// - [`polars::DataFrame`] +/// +/// # Panics +/// This will panic if `DataContainer.get_value()` tries to access +/// an index that does not exist. This is an explicit design choice: +/// we could have returned an `Option` or a `Result`. However, +/// when the index is invalid, there is a serious problem in the creation +/// of the `GRanges` object's set of ranges; this should not ever +/// happen in normal operation. 
Returning `Option`/`Result` just to +/// protect against this edge case would force most `GRanges` +/// functions to return a `Result`. This would clog the API usage, so +/// we opt to just panic. +pub trait IndexedDataContainer<'a> { + // note: I don't think we can refactor out this lifetime, + // since it is needed often for associated types + type Item; + type Output; + // this type is needed to reference the core underlying type, + // eg to handle reference types + fn is_valid_index(&self, index: usize) -> bool; + fn get_value(&'a self, index: usize) -> ::Item; + fn len(&self) -> usize; + fn is_empty(&self) -> bool { + self.len() == 0 + } + fn invalid_indices(&self, indices: &[usize]) -> Vec { + let mut invalid = Vec::new(); + for &index in indices { + if !self.is_valid_index(index) { + invalid.push(index); + } + } + invalid + } + + /// Create a new data container using a set of range indices. + fn new_from_indices(&self, indices: &[usize]) -> Self::Output; +} diff --git a/tests_data/example.bed b/tests_data/example.bed new file mode 100644 index 0000000..0f40e10 --- /dev/null +++ b/tests_data/example.bed @@ -0,0 +1,5 @@ +chr1 10 20 +chr1 14 18 +chr2 4 5 +chr2 5 6 +chr4 9 15 diff --git a/tests_data/invalid.bed b/tests_data/invalid.bed new file mode 100644 index 0000000..28d3141 --- /dev/null +++ b/tests_data/invalid.bed @@ -0,0 +1,5 @@ +chr1 -1 20 +chr1 14 18 +chr2 4 5 +chr2 5 6 +chr4 9 15 diff --git a/tests_data/noodles_example.bed b/tests_data/noodles_example.bed new file mode 100644 index 0000000..eccc0b9 --- /dev/null +++ b/tests_data/noodles_example.bed @@ -0,0 +1,2 @@ +sq0 7 13 +sq0 20 34