From 9d1cf48add93c8a5737a9126b383f850a3242ce8 Mon Sep 17 00:00:00 2001
From: Vince Buffalo <vsbuffalo@gmail.com>
Date: Sun, 25 Feb 2024 18:28:43 -0800
Subject: [PATCH] Benchmark windows and lots of doc clarity updates

---
 benches/bedtools_comparison.rs |  48 ++++++-
 src/granges.rs                 |   8 +-
 src/io/parsers.rs              |  29 -----
 src/lib.rs                     | 232 ++++++++++++++++++++++++++-------
 src/sequences/mod.rs           |  11 +-
 src/traits.rs                  |  14 +-
 6 files changed, 244 insertions(+), 98 deletions(-)
diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs
index 8d5a677..8f281c5 100644
--- a/benches/bedtools_comparison.rs
+++ b/benches/bedtools_comparison.rs
@@ -18,7 +18,7 @@ fn bench_range_adjustment(c: &mut Criterion) {
     let input_bedfile = random_bed3file(BED_LENGTH);
 
     // configure the sample size for the group
-    // group.sample_size(10);
+    group.sample_size(10);
 
     // bedtools slop
     group.bench_function("bedtools_slop", |b| {
@@ -145,10 +145,54 @@ fn bench_flank(c: &mut Criterion) {
     });
 }
 
+fn bench_windows(c: &mut Criterion) {
+    let width = 100_000;
+    let step = 1_000;
+
+    // create the benchmark group
+    let mut group = c.benchmark_group("windows");
+
+    // configure the sample size for the group
+    // group.sample_size(10);
+    group.bench_function("bedtools_makewindows", |b| {
+        b.iter(|| {
+            let bedtools_output = Command::new("bedtools")
+                .arg("makewindows")
+                .arg("-g")
+                .arg("tests_data/hg38_seqlens.tsv")
+                .arg("-w")
+                .arg(width.to_string())
+                .arg("-s")
+                .arg(step.to_string())
+                .output()
+                .expect("bedtools makewindows failed");
+            assert!(bedtools_output.status.success());
+        });
+    });
+
+    group.bench_function("granges_windows", |b| {
+        b.iter(|| {
+            let granges_output = Command::new(granges_binary_path())
+                .arg("windows")
+                .arg("--genome")
+                .arg("tests_data/hg38_seqlens.tsv")
+                .arg("--width")
+                .arg(width.to_string())
+                .arg("--step")
+                .arg(step.to_string())
+                .output()
+                .expect("granges windows failed");
+            assert!(granges_output.status.success());
+        });
+    });
+}
+
+
 criterion_group!(
     benches,
     bench_filter_adjustment,
     bench_range_adjustment,
     bench_flank,
-);
+    bench_windows,
+    );
 criterion_main!(benches);
diff --git a/src/granges.rs b/src/granges.rs
index 442a24a..25dd1cd 100644
--- a/src/granges.rs
+++ b/src/granges.rs
@@ -632,7 +632,7 @@ where
     /// [`GRanges<VecRangesIndexed, JoinData>`].
     ///
     /// The [`JoinData`] container contains the owned left data container and has
-    /// a reference to the right data container, as as well as a [`Vec<LeftGroupedJoin`]
+    /// a reference to the right data container, as as well as a [`Vec<LeftGroupedJoin>`]
     /// that contains information about each overlap between a left and zero or more right
     /// ranges.
     fn left_overlaps(
@@ -674,7 +674,7 @@ where
     /// [`GRanges<VecRangesIndexed, JoinDataRightEmpty>`].
     ///
     /// The [`JoinData`] container contains the left data container and has
-    /// a reference to the right data container, as as well as a [`Vec<LeftGroupedJoin`]
+    /// a reference to the right data container, as as well as a [`Vec<LeftGroupedJoin>`]
     /// that contains information about each overlap between a left and zero or more right
     /// ranges.
     fn left_overlaps(
@@ -726,7 +726,7 @@ where
     /// [`GRanges<VecRangesIndexed, JoinDataLeftEmpty>`].
     ///
     /// The [`JoinDataLeftEmpty`] contains no left data, and a reference to the
-    /// right data container, as as well as a [`Vec<LeftGroupedJoin`]
+    /// right data container, as as well as a [`Vec<LeftGroupedJoin>`]
     /// that contains information about each overlap between a left and zero or more right
     /// ranges.
     fn left_overlaps(
@@ -778,7 +778,7 @@ where
     /// [`GRanges<VecRangesIndexed, JoinDataBothEmpty>`].
     ///
     /// The [`JoinDataBothEmpty`] contains no data, since neither left of right
-    /// [`GRanges`] objects had data. However, it does contain a [`Vec<LeftGroupedJoin`],
+    /// [`GRanges`] objects had data. However, it does contain a [`Vec<LeftGroupedJoin>`],
     /// and each [`LeftGroupedJoin`] contains information about the number of overlapping
     /// ranges and their lengths. This can be used to summarize, e.g. the number
     /// of overlapping basepairs, the overlap fraction, etc.
diff --git a/src/io/parsers.rs b/src/io/parsers.rs
index 117fed8..b16cf48 100644
--- a/src/io/parsers.rs
+++ b/src/io/parsers.rs
@@ -70,35 +70,6 @@
 //! iterator types defined in [`GenomicRangesParser`]. The best examples of this are the
 //! `granges` subcommands implementations.
 //!
-//! ## File Format Terminology
-//!
-//!  - BED3
-//!  - BED*
-//!  - BED-like
-//!
-//!
-//! : it could be ranges-only (i.e. a BED3), or contain data (e.g. BED5). The lazy BED parser will
-//! output a [`GenomicRangeRecord<Option<String>>`], where the data would be `None` only in the
-//! case that three columns were encountered in the file (which must be a BED3).
-//!
-//! In GRanges, there are two types of ranges: ranges with an index to an element in the data
-//! container, and ranges without indices (i.e. what we would use in the case of processing a BED3
-//! file). Since a [`GRanges`] object needs to have a single, concrete range type in its range
-//! containers, it must be known *at compile time* how one should convert the
-//!
-//! Downstream pipelines must immediately determine how to handle whether there is additional data,
-//! or all of the [`GenomicRangeRecord`] entries are `None`, and
-//!
-//!
-//! # BED-like File Parser Design
-//!
-//! All BED formats (BED3, BED5, etc) are built upon a BED3. Often when working with these types
-//! of formats, many operations do not immediately require full parsing of the line, past the
-//! *range components*. This is because downstream operations may immediately filter away entries
-//! (e.g. based on width), or do an overlap operation, and then filter away entries based on some
-//! overlap criteria. Either way, it may be advantageous to work with ranges that just store some
-//! generic data.
-//!
 //! [`GRanges`]: crate::granges::GRanges
 //! [`GRanges<R, T>`]: crate::granges::GRanges
 //! [`GRangesEmpty`]: crate::granges::GRangesEmpty
diff --git a/src/lib.rs b/src/lib.rs
index c4b41c4..64af9f1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,30 +5,128 @@
 //! # GRanges: Generic Genomic Range and Data Containers
 //!
 //! GRanges is a Rust library for working with genomic ranges and their associated data. GRanges
-//! also provides a separate command line tool built using this library, which currently
-//! implements a subset of the features of other great bioinformatics utilities like
-//! [bedtools](https://bedtools.readthedocs.io/en/latest/).
-//!
-//! The idea of GRanges is to facilitate building a new generation of robust, reproducible, and
-//! easily installable bioinformatics tools in Rust. It aims to lower the barrier to building
-//! simple bioinformatics data processing tools in Rust for biologists that are interested
-//! in using the language in their work. Even biologists that have no interest in learning Rust
-//! should benefit from Rust-based tools since they are fast. Currently GRanges is engineered
-//! as a set of compile-time generic types and methods, though future versions may implement
-//! analagous runtime dynamic data structures that would allow the library to be wrapped and used
-//! by other languages.
+//! also implements a [bedtools](https://bedtools.readthedocs.io/en/latest/)-like command line tool
+//! as a test and benchmark of the library. In benchmarks, this command line tool is reliably
+//! 30-40% faster than bedtools. Internally, GRanges uses the very fast
+//! [coitrees](https://github.com/dcjones/coitrees) library written by Daniel C. Jones for overlap
+//! operations.
+//!
+//! The GRanges library aims to simplify the creation of powerful, performant genomics tools in
+//! Rust. GRanegs is inspired by the design and ease of use of Bioconductor's
+//! [GenomicRanges](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003118)
+//! and [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html)
+//! packages, but tools using GRanges are optimized at compile time and thus are typically much
+//! faster.
+//!
+//! ## GRanges Design
+//!
+//! Nearly all common data processing and analytics tasks in science can be thought of as taking
+//! some raw input data and refining it down some pipeline, until eventually outputting some sort
+//! of statistical result. In bioinformatics, genomics data processing workflows are often composed
+//! from command line tools and Unix pipes. Analogously, tidyverse unifies data tidying,
+//! summarizing, and modeling by passing and manipulating a datafame down a similar pipeline. Unix
+//! pipes and tidyverse are powerful because they allow the scientist to build highly specialized
+//! analytic tools just by remixing the same core set of data operations. 
+//!
+//! GRanges implements many of the same core data joining and manipulation operations as
+//! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and
+//! R's [tidyverse](https://www.tidyverse.org). 
+//!
+//! ## The [`GRanges`] container
+//!
+//! Central to the GRanges library is the generic [`GRanges<R, T>`] data container. This is generic
+//! over the *range container* type `R`, because range data structures have inherent performance
+//! tradeoffs for different tasks. For example, when parsing input data into memory, the total size
+//! is often not known in advanced. Consequently, the most efficient range container data structure
+//! is a dynamically-growing [`Vec<U>`]. However, when overlaps need to be computed between ranges
+//! across two containers, the *interval tree* data structures are needed. [`GRanges`] supports
+//! conversion between these range container types. 
+//!
+//! [`GRanges<R, T>`] is also generic over its *data container*, `T`. This allows it to hold *any*
+//! type of data, in any data structure: a [`Vec<f64>`] of numeric stores, an
+//! [`ndarray::Array2`](https://docs.rs/ndarray/latest/ndarray/index.html) matrix, a
+//! [polars](https://pola.rs) dataframe, custom `struct`s, etc. Each range in the ranges container
+//! indexes an element in the data container. There is also a special [`GRangesEmpty`] type for
+//! data-less [`GRanges`] objects.
+//!
+//! The [`GRanges`] type implements a small but powerful set of methods that can be used to process
+//! and analyze genomic data. GRanges relies heavily on *method-chaining* operations that pass
+//! genomic range data down a pipeline, similar to tidyverse and Unix pipes.
+//!
+//! ## The Overlaps--Map-Combine Pattern
+//!
+//! A very common data analysis strategy is **Split-Apply-Combine** pattern ([Wickham,
+//! 2011](https://vita.had.co.nz/papers/plyr.html)). When working with genomic overlaps
+//! and their data, GRanges library relies on its own data analysis pattern known 
+//! as **Overlaps--Map-Combine** pattern, which is better suited for common genomic operations and analyses.
+//!
+//! ### Overlap Joins and the Left Grouped Join
+//!
+//! In this pattern, the *left* genomic ranges are "joined" by their *overlaps* with another
+//! *right* genomic ranges. In other words, the *right ranges* that overlap a single *left range* can 
+//! be thought of "joined" to this left range.
+//!
+//! With genomic data, downstream processing is greatly simplified if the results of this join 
+//! are *grouped* by what left range they overlap (see illustration below). This is a special 
+//! kind of join known as a **left grouped join**. 
+//!
+//!
+//! ```text
+//!     Left range:   ███████████████████████████████████
+//!                                                      
+//!   Right ranges:     ██    ██████        ██████    ██████
+//! ```
+//!
+//! In GRanges, a left grouped join ([`GRanges::left_overlaps()`]) returns a [`GRanges`] object
+//! containing the exact same ranges as the input *left ranges*. In other words, *left grouped
+//! joins* are *endomorphic in the range container*. Computationally, this is also extremely
+//! efficient, because the left ranges can be passed through to the results directly (a
+//! zero-overheard operation). 
+//!
+//! 
+//!
+//! The [`GRanges`] object returned by [`GRanges::left_overlaps()`] contains a [`JoinData`] (or related)
+//! type as its data container. The [`JoinData`] contains information about each left range's overlaps (e.g. number of 
+//! overlapping bases, fraction of overlaps, etc) in a [`Vec<LeftGroupedJoin>`]. This information about 
+//! overlapping ranges may then be used by downstream calculations.
+//! Additionally [`JoinData`] stores the left ranges' data container, and has a *reference* to 
+//! the right ranges' data container. In both cases, the data is *unchanged* by the join. 
+//! Downstream processing can also easily access the left ranges's data and all overlapping right ranges'
+//! data for calculations.
+//! 
+//! ### Map-Combine over Joins
+//!
+//! After a left grouped join, each left range can have zero or more overlapping right ranges.
+//! A *map-combine* operation (optionally) maps a function over all right ranges' data and
+//! overlaps information, and then combines that into a single data entry per left range. These
+//! combined data entries make up a new [`GRanges<R, Vec<V>>`] data container, returned by [`GRanges::map_over_joins()`].
+//!
+//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_over_joins()`] is endomorphic 
+//! over its range container. This means it can be passed through without modification, which 
+//! is computationally efficient. This results from a Map-Combine operation then can be overlap joined with 
+//! other genomic ranges, filtered, have its data arbitrarily manipulated by [`GRanges::map_data()`], etc.
 //!
 //! ## Example
 //!
-//! The best introduction to the GRanges library is an example illustrating
-//! how a common genomic data processing task can be expressed in few lines
-//! of (relatively simple) code. Below is an annotated implementation of
-//! `granges map`, which provides the same functionality of `bedtools map`:
-//! arbitrary operations (e.g. mean, median, max, etc) are applied to the numeric
-//! scores of all right ranges that overlap a each left range:
+//! To illustrate these ideas, lets look at an example of how we might use GRanges to do a
+//! a commonly-encountered genomic calculation. Suppose you had a set of exon genomic ranges (the *left
+//! ranges*) and a multicolumn TSV (*right ranges*) of various genomic features (e.g. assay
+//! results, recombination hotspots, variants, etc). Imagine you wanted to form some statistic per
+//! each exon, based on data in the overlapping right ranges. This operation is very
+//! similar to `bedtools map`, except that suppose the statistic you want to calculate is not one of the 
+//! few offered by bedtools. GRanges makes it simple to compose your own, very fast genomic tools to answer these
+//! questions.
+//!
+//! To see how GRanges would make it simple to compose a specialized fast tool to solve this
+//! problem, let's fist see how few lines of code it would take to implement `bedtools map`,
+//! since our problem is akin to using `bedtools map` with our own function to calculate our statistic.
+//! Let's start by getting the mean score by seeing how to get the mean BED5 score across all 
+//! overlapping right ranges for each left range (i.e. `bedtools map -a <left> -b <right> -c 5 mean`). 
+//! Here is the Rust code to do this using GRanges:
 //!
 //! ```
-//! use granges::prelude::*;
+//! # use granges::prelude::*;
+//! # fn try_main() -> Result<(), granges::error::GRangesError> {
 //!
 //! // Data for example:
 //! let genome = seqlens!("chr1" => 100, "chr2" => 100);
@@ -39,10 +137,8 @@
 //! let seqnames: Vec<String> = genome.keys().cloned().collect();
 //!
 //! // Create parsing iterators to the left and right BED files.
-//! let left_iter = Bed3Iterator::new(left_path)
-//!                   .expect("error inferring filetype");
-//! let right_iter = Bed5Iterator::new(right_path)
-//!                   .expect("error inferring filetype");
+//! let left_iter = Bed3Iterator::new(left_path).expect("error inferring filetype");
+//! let right_iter = Bed5Iterator::new(right_path).expect("error inferring filetype");
 //!
 //! // Filter out any ranges from chromosomes not in our genome file.
 //! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome)
@@ -79,14 +175,17 @@
 //! // for missing values, etc.
 //! let path = Some("map_results.bed.gz");
 //! result_gr.to_tsv(path, &BED_TSV).expect("error writing output");
+//! # Ok(())
+//! # }
+//! # fn main() { try_main().unwrap(); }
 //! ```
 //!
-//! ## Design Overview
+//! In relatively few lines of code, we can implement core functionality of bedtools.
+//! As mentioned earlier, GRanges code typically runs 30-40% faster than bedtools.
 //!
-//! To understand how GRanges could be useful in processing genomic data, it's helpful to first
-//! understand the overall design of this software library.
+//! ## Loading Data into GRanges
 //!
-//! Nearly all range data is loaded into GRanges using **parsing iterators** (see the [parsers]
+//! Nearly all genomic range data enters GRanges through **parsing iterators** (see the [parsers]
 //! module). Parsing iterators read input data from some bioinformatics file format (e.g. `.bed`,
 //! `.gtf.gz`, etc) and it then goes down one of two paths:
 //!
@@ -99,8 +198,12 @@
 //!     [`GRanges`] objects then provide high-level in-memory methods for working with this
 //!     genomic data.
 //!
-//! Both modes then eventually write something to output: a TSV of some statistic calculated on the
-//! genomic data, the output of 10 million block bootstraps, or the coefficients of
+//! In the example above, we loaded the items in the parsing iterators directly into 
+//! [`GRanges`] objects, since we had to do overlap operations (in the future, GRanges will
+//! support streaming overlap joins for position-sorted input data).
+//!
+//! Both processing modes eventually output something, e.g. a TSV of some statistic calculated 
+//! on the genomic data, the output of 10 million block bootstraps, or the coefficients of
 //! linear regressions run every kilobase on sequence data.
 //!
 //! The GRanges workflow idea is illustrated in this (super cool) ASCII diagram:
@@ -125,28 +228,41 @@
 //!
 //! ```
 //!
-//! Nearly all common data processing and analytics tasks in science can be thought of as taking
-//! some raw input data and refining it down some pipeline, until eventually outputting it as some sort of statistical
-//! result. Bioinformatics has been built on composing workflows out of command line tools and
-//! pipes. Analogously, tidyverse unifies data tidying, summarizing, and modeling by passing data
-//! in a datafame down a similar pipeline-based interface. Both operations are simple to use
-//! because they compose the same essential *operations* into specific data processing workflows.
+//! ## Manipulating GRanges objects 
 //!
-//! GRanges is designed to emulate these powerful data analytics tools. It implements core
-//! operations on genomic ranges and their associated data, and provides generic, extensible
-//! data structures for storing genomic data.
+//! 1. *Creation*: [`GRanges::new_vec()`], [`GRanges::from_iter()`], [`GRangesEmpty::from_windows()`].
 //!
-//! GRanges attempts to implement the same core set of data joining and processing tools as the
-//! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and
-//! R's [tidyverse](https://www.tidyverse.org). It also frames these genomic operations in
-//! standard data analytic and database terms. For example, the operation `bedtools intersect -wa -u -a <left>
-//! -b <right>` finds all ranges in the *left* set of genomic ranges that overlaps one or more
-//! *right* genomic ranges, and reports only the unique left range. This operation is common in
-//! many data analytics workflows and database queries, and is known as a *semi join* between a
-//! a left table and right table. (see for example this section in Hadley Wickham's book [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins)).
-//! In the `granges` command line tool, this is implemented in the `granges filter` subcommand. In
-//! the library, this subcommand is just the method [`GRanges::filter_overlaps()`].
+//! 2. *Range-modifying* functions: [`GRanges::into_coitrees()`], [`GRanges::adjust_ranges()`], [`GRanges::sort()`],
+//!        [`GRanges::flanking_ranges()`], [`GRanges::filter_overlaps()`].
+//!
+//! 3. *Data-modifying* functions: [`GRanges::left_overlaps()`], [`GRanges::map_over_joins()`], [`GRanges::map_data()`].
+//!
+//! 4. *Range and Data modifying* functions: [`GRanges::push_range()`].
+//!
+//! Note that not all range and data container types support these operations, e.g.
+//! [`GRanges::push_range()`] is not available for ranges stored in an interval tree.
+//! However, by design, this is known *at compile time*, due Rust's typing system. Thus 
+//! there is lower risk of runtime panics, since more potential issues are caught at 
+//! compile time.
+//!
+//! ## The (Active) Future
+//!
+//! When I was first learning Rust as a developer, one thing that struck me about the language
+//! is that it feels *minimal but never constraining*. This is quite a contrast compared to 
+//! languages like C++. As Rust developers know, the minimalness was by design; the
+//! cultural norm of slow growth produced a better final product in the end. 
+//!
+//! The GRanges library attempts to follow this design too. Currently, the alpha release is
+//! about implementating a subset of essential core functionality to benchmark against 
+//! alternative software. Since the design and ergonomics of the API are under active development,
+//! please, *please*, file a GitHub issue if:
+//!
+//!  1. You want a particular feature.
+//!
+//!  2. You don't know how you'd implement a feature yourself with GRanges.
 //!
+//!  3. The ["ergonomics"](https://blog.rust-lang.org/2017/03/02/lang-ergonomics.html) don't
+//!     feel right.
 //!
 //! ## Documentation Guide
 //!
@@ -155,10 +271,26 @@
 //!     or is "sharp" (developer error could cause a panic).
 //!  - 🚀: optimization tip.
 //!
+//! [`JoinData`]: crate::join::JoinData
 //! [`GRanges`]: crate::granges::GRanges
-//! [`GRanges::filter_overlaps()`]: crate::granges::GRanges
+//! [`GRangesEmpty`]: crate::granges::GRangesEmpty
+//! [`GRanges::filter_overlaps()`]: granges::GRanges::filter_overlaps
 //! [`GRanges<R, T>`]: crate::granges::GRanges
 //! [parsers]: crate::io::parsers
+//! [`ndarray::Array2`]: ndarray::Array2
+//! [`GRanges::left_overlaps()`]: crate::traits::LeftOverlaps::left_overlaps
+//! [`GRanges<R, Vec<V>>`]: crate::granges::GRanges
+//! [`GRanges::map_over_joins()`]: crate::granges::GRanges::map_over_joins
+//! [`GRanges::map_data()`]: crate::granges::GRanges::map_data
+//! [`GRanges::new_vec()`]: crate::granges::GRanges::new_vec
+//! [`GRanges::into_coitrees()`]: crate::granges::GRanges::into_coitrees
+//! [`GRanges::adjust_ranges()`]: crate::granges::GRanges::adjust_ranges
+//! [`GRanges::push_range()`]: crate::granges::GRanges::push_range
+//! [`GRanges::flanking_ranges()`]: crate::granges::GRanges::flanking_ranges
+//! [`GRanges::sort()`]: crate::granges::GRanges::sort
+//! [`GRanges::from_iter()`]: crate::granges::GRanges::from_iter
+//! [`GRangesEmpty::from_windows()`]: crate::granges::GRangesEmpty::from_windows
+// TODO broken links, gr!
 
 pub use indexmap;
 
@@ -183,7 +315,7 @@ pub mod reporting;
 /// This type is currently an unwrapped [`u32`]. This should handle
 /// chromosome lengths for nearly all species. In fact, the only exception
 /// known so far is lungfush (*Neoceratodus forsteri*), which has a chromosomes
-/// that reaches 5.4Gb (https://www.nature.com/articles/s41586-021-03198-8l).
+/// that reaches 5.4Gb (<https://www.nature.com/articles/s41586-021-03198-8l>).
 /// The [`u32::MAX`] is 4,294,967,295, i.e. 4.29 Gigabases, which means [`u32`] is
 /// just barely suitable for even the largest known chromosome. There is a
 /// performance and memory-efficiency tradeoff when using [`u64`] over [`u32`],
diff --git a/src/sequences/mod.rs b/src/sequences/mod.rs
index 6bfc124..3cceb40 100644
--- a/src/sequences/mod.rs
+++ b/src/sequences/mod.rs
@@ -10,18 +10,17 @@
 //! ## Main Functionality
 //!
 //!  - Lazy-loading by sequence (e.g. chromosome) with the [`LazyLoader`] type.
+//!
 //!  - If the `--features=ndarray` is set, one and two dimensional per-basepair
-//!    numeric arrays/matrices *in-memory*, with [`NumericSequences1`] and
-//!    [`NumericSequences2`], or *lazy loaded* with [`LazyNumericSequences2`]
-//!    ([``LazyNumericSequences1`] will be added soon, please file a [GitHub
-//!    issue](https://github.com/vsbuffalo/granges/issues) if you need it).
+//!    numeric arrays/matrices *in-memory*, with `NumericSequences1` and
+//!    `NumericSequences2`, or *lazy loaded* with `LazyNumericSequences2`
+//!    (`LazyNumericSequences1` will be added soon, please file a [GitHub
+//!    issue](https://github.com]/vsbuffalo/granges/issues) if you need it).
 //!
 //!
 //! [`Sequences`]: crate::traits::Sequences
 //! [`LazyLoader`]: crate::sequences::lazy
 //!
-//! [`NumericSequences1`]: crate::sequences::numeric::NumericSequences1
-//! [`NumericSequences2`]: crate::sequences::numeric::NumericSequences2
 
 pub mod lazy;
 pub mod nucleotide;
diff --git a/src/traits.rs b/src/traits.rs
index 2d71aba..e76c5ec 100644
--- a/src/traits.rs
+++ b/src/traits.rs
@@ -89,11 +89,12 @@ pub trait GenericRange: Clone {
 
 /// The [`JoinDataOperations`] trait unifies common operations
 /// over combined join data types ([`CombinedJoinData`],
-/// CombinedJoinDataBothEmpty`], etc).
+/// [`CombinedJoinDataBothEmpty`], etc).
 ///
 ///
-/// [`CombinedJoinData`] crate::granges::join::CombinedJoinData
-/// [`CombinedJoinDataBothEmpty`] crate::granges::join::CombinedJoinDataBothEmpty
+/// [`JoinDataOperations`]: crate::traits::JoinDataOperations
+/// [`CombinedJoinData`]: crate::join::CombinedJoinData
+/// [`CombinedJoinDataBothEmpty`]: crate::join::CombinedJoinDataBothEmpty
 pub trait JoinDataOperations<DL, DR> {
     type LeftDataElementType;
     type RightDataElementType;
@@ -209,10 +210,9 @@ pub trait IntoIterableRangesContainer<R> {
 ///
 /// GRanges provides [`IndexedDataContainer`] trait implementations for:
 ///
-///  - [`ndarray::Array1`]
-///  - [`ndarray::Array2`]
-///  - [`Vec`]
-///  - [`polars::DataFrame`]
+///  - [`Vec`], this is the most common and general data container.
+///  - [ndarray](https://docs.rs/ndarray/latest/ndarray/index.html) `Array1` and `Array2`, if `--features=ndarray` is set.
+///  - [polars](https://pola.rs) datafames.
 ///
 /// # Panics
 /// This will panic if `DataContainer.get_value()` tries to access