Skip to content

Commit

Permalink
Redesign of how joins are stored in LeftGroupedJoin.
Browse files Browse the repository at this point in the history
 - Previously only overlap and data indices were stored in
   `LeftGroupedJoin`.  However, this makes downstream operations
   that need to compare, merge/flatten, etc the right ranges
   harder. So, now a `Vec<RangeTuples>` are stored. `RangeTuple`
   is a new minimal range type with `Option<usize>` index (they
   fit well in `LeftGroupedJoin`, since this option avoids having
   to carry indexed/empty types to this level, which makes prevents
   API clog.

 - `RangeTuple` and functionality added.

 - New experimental method `hist-feature` `--reduce` option which
   forces overlapping right ranges to create a new set of their
   feature types, e.g. "CDS,exon,5UTR", which allows for exhaustive
   and exclusive assignment of each basepair in a window to a particular
   "feature set". This isn't tested yet, hence experimental.

 - Commented out `IndexedDataContainer` methods that weren't used and
   just getting in the way.

 - New `UniqueIdentifier` type which is useful for key to `usize`
   mapping. Then, implemntations for `UniqueIdentifier` so it is a
   `IndexedDataContainer`. This allows for much more efficient storage
   of data in indexed data containers when one value is repeated a lot.
   Added `GRanges::new_vec_keyed()` and `GRanges::push_range_with_key()`
   to implement these methods (the former's name may change).

 - `RangeReduced` type, which stores "reduced" or flattend ranges.
   This is used in the `--reduce` option of `hist-features`.

 - `ConditionalMergingIterator` -- without the `Result`.

 - `reduce_ranges()` method.

 - Tests are now seeded. There is an incredibly rare edgecase
   difference with some stochastically-seeded random genomic
   ranges with `test_against_bedtools_map_multiple`. The results
   look like: bedtools: `Some(0.9625260632890147)`, granges: `None`.
   The cause of this bug, and whether it is a granges or bedtools
   issue is unknown. But it is rare.
  • Loading branch information
vsbuffalo committed Mar 5, 2024
1 parent 2e98a14 commit 3b6cd34
Show file tree
Hide file tree
Showing 12 changed files with 848 additions and 155 deletions.
235 changes: 186 additions & 49 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::{
ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty},
reporting::{CommandOutput, Report},
test_utilities::{random_granges, random_granges_mock_bed5},
unique_id::UniqueIdentifier,
Position, PositionOffset,
};

Expand Down Expand Up @@ -723,7 +724,7 @@ fn transpose<T>(v: Vec<Vec<T>>) -> Vec<Vec<T>> {
/// This is useful for a quick exploratory look at feature density
/// by window, e.g. with
///
/// $ hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
/// $ granges hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \
/// hg38_seqlens.tsv --headers | xsv table -d'\t' | less
///
/// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing
Expand All @@ -750,7 +751,20 @@ pub struct HistFeatures {
#[arg(short, long)]
chop: bool,

/// Assign each basepair exclusively to a single "feature set"
/// based on what features overlap it, and count the number of
/// these per window. E.g. a region that overlaps "CDS" and "exon"
/// will be called a "CDS,exon" feature. Columns are sorted
/// in descending order by column sums. Headers will always be
/// used in this case.
///
/// ⚠️ Warning! This feature is *experimental* and currently does not
/// have unit or integration tests.
#[arg(short, long)]
exclusive: bool,

/// Whether to output the results as a TSV with headers.
/// When --classify is set, headers will always be output.
#[arg(short = 'l', long)]
headers: bool,

Expand All @@ -763,71 +777,194 @@ impl HistFeatures {
pub fn run(&self) -> Result<CommandOutput<()>, GRangesError> {
let bedfile = &self.bedfile;
let genome = read_seqlens(&self.genome)?;

let bed4_iter = Bed4Iterator::new(bedfile)?;

// Split the elements in the iterator by feature into multiple GRanges objects.
let mut records_by_features: HashMap<String, Vec<GenomicRangeRecordEmpty>> = HashMap::new();
for result in bed4_iter {
let range = result?;
let feature = &range.data.name;
let vec = records_by_features.entry(feature.to_string()).or_default();
vec.push(range.into_empty()) // drop the feature, since we're hashing on it
}
if !self.exclusive {
// Split the elements in the iterator by feature into multiple GRanges objects.
let mut records_by_features: HashMap<String, Vec<GenomicRangeRecordEmpty>> =
HashMap::new();
for result in bed4_iter {
let range = result?;
let feature = &range.data.name;
let vec = records_by_features.entry(feature.to_string()).or_default();
vec.push(range.into_empty()) // drop the feature, since we're hashing on it
}

// Load all the *merged* split records into memory as interval trees.
let mut gr_by_features: HashMap<String, GRangesEmpty<COITreesEmpty>> = HashMap::new();
for (feature, ranges) in records_by_features.into_iter() {
// doing a streaming merge of each feature; bookend merge
let merging_iter = MergingEmptyIterator::new(ranges, 0);
// Load all the *merged* split records into memory as interval trees.
let mut gr_by_features: HashMap<String, GRangesEmpty<COITreesEmpty>> = HashMap::new();
for (feature, ranges) in records_by_features.into_iter() {
// doing a streaming merge of each feature; bookend merge
let merging_iter = MergingEmptyIterator::new(ranges, 0);

// load into memory and convert to interval trees
let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?;
// load into memory and convert to interval trees
let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?;

assert!(!gr_by_features.contains_key(&feature));
gr_by_features.insert(feature, gr);
}
assert!(!gr_by_features.contains_key(&feature));
gr_by_features.insert(feature, gr);
}

// Now, create windows.
let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;

let features: Vec<_> = gr_by_features.keys().cloned().collect();
let mut feature_matrix = Vec::new();
for (_feature, gr) in gr_by_features.into_iter() {
// find the features that overlap each window
// TODO/OPTIMIZE: how costly is this clone?
let window_counts = windows.clone()
.left_overlaps(&gr)?
.map_joins(|joins| {
// these are merged, so this is the number of *unique* basepairs
let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum();
total_overlaps
})?
.take_data()?;
feature_matrix.push(window_counts);
}
// Now, create windows.
let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;

let features: Vec<_> = gr_by_features.keys().cloned().collect();
let mut feature_matrix = Vec::new();
for (_feature, gr) in gr_by_features.into_iter() {
// find the features that overlap each window
// TODO/OPTIMIZE: how costly is this clone?
let window_counts = windows
.clone()
.left_overlaps(&gr)?
.map_joins(|joins| {
// these are merged, so this is the number of *unique* basepairs
let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum();
total_overlaps
})?
.take_data()?;
feature_matrix.push(window_counts);
}

let feature_matrix = transpose(feature_matrix);
let feature_matrix = transpose(feature_matrix);

// Unite the windows with the feature matrix.
let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
let windows = windows.into_granges_data(feature_matrix)?;
// Unite the windows with the feature matrix.
let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;
let windows = windows.into_granges_data(feature_matrix)?;

// Write everything.
if !self.headers {
// we have to *manually* add headers (serde needs flat structs otherwise)
windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?;
// Write everything.
if !self.headers {
// we have to *manually* add headers (serde needs flat structs otherwise)
windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?;
} else {
let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()];
headers.extend(features);

let config = TsvConfig {
no_value_string: "NA".to_string(),
headers: Some(headers),
};
windows.write_to_tsv(self.output.as_ref(), &config)?;
}
} else {
// This is done a bit differently than the non-classify case

// Create GRanges of the feature indices.
// We do this manually to avoid creating a needless data container.
let mut gr = GRanges::new_vec_keyed(&genome);
for result in bed4_iter {
let range = result?;

// Note the trick here: we are *re-using* indices.
gr.push_range_with_key(&range.seqname, range.start, range.end, &range.data.name)?;
}
let gr = gr.into_coitrees()?;

// clone the feature map
let feature_map = gr.data().ok_or(GRangesError::NoDataContainer)?.clone();

// Create windows.
let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?;

// "Reduce" ranges and
let windows_overlaps = windows.left_overlaps(&gr)?.map_joins(|mut join| {
// "reduce" the ranges to a minimum spanning set that contains a vec of indices
let ranges = join.join.reduce_ranges();
let mut feature_overlaps: HashMap<Vec<usize>, _> = HashMap::new();
for range in ranges.iter() {
let mut key: Vec<usize> = range.indices().iter().map(|x| x.unwrap()).collect();
key.sort(); // sorted key is compound key
*feature_overlaps.entry(key).or_insert(0) += range.width();
}

feature_overlaps
})?;

// Now, we go over the data and get the observed sets
let mut observed_sets = UniqueIdentifier::new();
let data = windows_overlaps
.data()
.ok_or(GRangesError::NoDataContainer)?;
for feature_counts in data.iter() {
for feature_set in feature_counts.keys() {
observed_sets.get_or_insert(feature_set);
}
}
let nsets = observed_sets.len(); // total feature sets

// Go over data again, making the sparse hashmap into a full
// matrix of overlappng basepairs.
let gr = windows_overlaps.map_data(|feature_counts| {
// fill out matrix
let mut counts = vec![0; nsets];
for (i, feature_set) in observed_sets.keys().enumerate() {
counts[i] = *feature_counts.get(feature_set).unwrap_or(&0);
}
counts
})?;

// To make this prettier, let's sort by column totals (this
// is a little pricey; we can make option later if needed).
let matrix = gr.data().ok_or(GRangesError::NoDataContainer)?;
let col_totals = column_totals(matrix);
let sorted_indices = sorted_indices_by_values(&col_totals);

let gr = gr.map_data(|row| {
let row: Vec<_> = sorted_indices.iter().map(|i| row[*i]).collect();
row
})?;

// Create the labels.
let feature_sets: Vec<String> = observed_sets
.keys()
.map(|indices_key| {
let labels: Vec<_> = indices_key
.iter()
.map(|idx| feature_map.get_key(*idx).unwrap().clone())
.collect();
labels.join(",")
})
.collect();

// Re-order the headers too by the sorted indices.
let feature_sets: Vec<_> = sorted_indices.iter()
.map(|i| feature_sets[*i].clone()).collect();

let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()];
headers.extend(features);
headers.extend(feature_sets);

let config = TsvConfig {
no_value_string: "NA".to_string(),
headers: Some(headers),
};
windows.write_to_tsv(self.output.as_ref(), &config)?;
gr.write_to_tsv(self.output.as_ref(), &config)?;
}

Ok(CommandOutput::new((), None))
}
}


// get column totals
fn column_totals(matrix: &Vec<Vec<Position>>) -> Vec<Position> {
if matrix.is_empty() || matrix[0].is_empty() {
return Vec::new();
}

let num_columns = matrix[0].len();
let mut totals = vec![0; num_columns];

for row in matrix {
for (i, &value) in row.iter().enumerate() {
if i < totals.len() {
totals[i] += value;
}
}
}

totals
}

// get descending indices order
fn sorted_indices_by_values(values: &[Position]) -> Vec<usize> {
let mut indices: Vec<usize> = (0..values.len()).collect();
indices.sort_by_key(|&i| std::cmp::Reverse(values[i]));
indices
}

32 changes: 16 additions & 16 deletions src/data/ndarray.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ where
{
type Item<'a> = U;
type OwnedItem = U;
type Output = Array1<U>;
// type Output = Array1<U>;

fn get_value(&self, index: usize) -> Self::Item<'_> {
self[index]
Expand All @@ -165,9 +165,9 @@ where
index < self.shape()[0]
}

fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx)))
}
// fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
// Array1::from_iter(indices.iter().map(|&idx| self.get_value(idx)))
// }
}

impl<U> IndexedDataContainer for Array2<U>
Expand All @@ -176,7 +176,7 @@ where
{
type Item<'a> = ArrayView1<'a, U>;
type OwnedItem = Array1<U>;
type Output = Array2<U>;
// type Output = Array2<U>;

fn get_value(&self, index: usize) -> Self::Item<'_> {
self.row(index)
Expand All @@ -194,19 +194,19 @@ where
index < self.shape()[0]
}

fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
let cols = self.shape()[1];
//fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
// let cols = self.shape()[1];

let rows_data: Vec<U> = indices
.iter()
.flat_map(|&idx| self.row(idx).iter().cloned().collect::<Vec<_>>())
.collect();
// let rows_data: Vec<U> = indices
// .iter()
// .flat_map(|&idx| self.row(idx).iter().cloned().collect::<Vec<_>>())
// .collect();

// create a new Array2<U> from the rows
// shape is (number of indices, number of columns)
Array2::from_shape_vec((indices.len(), cols), rows_data)
.expect("Shape and collected data size mismatch")
}
// // create a new Array2<U> from the rows
// // shape is (number of indices, number of columns)
// Array2::from_shape_vec((indices.len(), cols), rows_data)
// .expect("Shape and collected data size mismatch")
//}
}

#[cfg(test)]
Expand Down
8 changes: 4 additions & 4 deletions src/data/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ where
{
type Item<'a> = &'a U where Self: 'a;
type OwnedItem = U;
type Output = Vec<U>;
// type Output = Vec<U>;

fn get_value(&self, index: usize) -> Self::Item<'_> {
self.get(index).unwrap()
Expand All @@ -29,7 +29,7 @@ where
self.get(index).is_some()
}

fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
}
// fn new_from_indices(&self, indices: &[usize]) -> Self::Output {
// Vec::from_iter(indices.iter().map(|&idx| (*self.get_value(idx)).clone()))
//}
}
Loading

0 comments on commit 3b6cd34

Please sign in to comment.