Skip to content

Commit

Permalink
New antifilter_overlaps().
Browse files Browse the repository at this point in the history
 - New `antifilter_overlaps()` for `GRanges` and `GRangesEmpty`.
 - New test case for `GRanges` case (`GRangesEmpty` missing).
 - New base fucntion for `GRanges` that semi/anti filtering join
   cases wrap.
  • Loading branch information
vsbuffalo committed Mar 1, 2024
1 parent cd05f1f commit 0863d3f
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 35 deletions.
28 changes: 15 additions & 13 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Command functions that implement each of the `granges` subcommands.

use std::{path::PathBuf, io, fs::File};
use std::{fs::File, io, path::PathBuf};

use csv::WriterBuilder;

Expand All @@ -11,7 +11,7 @@ use crate::{
tsv::BED_TSV,
},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeRecordEmpty, GenomicRangeRecord},
ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty},
reporting::{CommandOutput, Report},
test_utilities::{random_granges, random_granges_mock_bed5},
Position, PositionOffset,
Expand Down Expand Up @@ -94,9 +94,9 @@ pub fn granges_adjust(

if skipped_ranges > 0 {
report.add_issue(format!(
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
}
}
} else {
Expand Down Expand Up @@ -161,7 +161,7 @@ pub fn granges_filter(
right_path: &PathBuf,
output: Option<&PathBuf>,
skip_missing: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();

Expand Down Expand Up @@ -200,7 +200,7 @@ pub fn granges_filter(
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
Expand Down Expand Up @@ -243,7 +243,7 @@ pub fn granges_filter(
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
Expand Down Expand Up @@ -292,7 +292,7 @@ pub fn granges_flank(
output: Option<&PathBuf>,
skip_missing: bool,
mode: ProcessingMode,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?;
Expand Down Expand Up @@ -424,7 +424,7 @@ pub fn granges_map(
operations: Vec<Operation>,
output: Option<&PathBuf>,
skip_missing: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let report = Report::new();
Expand Down Expand Up @@ -478,7 +478,9 @@ pub fn granges_map(
operations
.iter()
.map(|operation| {
operation.run(&mut overlap_scores).into_serializable(&BED_TSV)
operation
.run(&mut overlap_scores)
.into_serializable(&BED_TSV)
})
.collect::<Vec<SerializableDatumType>>()
})?;
Expand All @@ -495,7 +497,7 @@ pub fn granges_windows(
step: Option<Position>,
chop: bool,
output: Option<impl Into<PathBuf>>,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?;
let report = Report::new();
Expand All @@ -509,7 +511,7 @@ pub fn granges_random_bed(
output: Option<impl Into<PathBuf>>,
sort: bool,
bed5: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand Down
155 changes: 136 additions & 19 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1280,15 +1280,50 @@ impl<CL: RangeContainer> GRangesEmpty<CL>
where
CL: IterableRangeContainer,
{
/// Filter out ranges that do *not* have at least overlap with the `right` ranges.
/// Exclude genomic ranges in this object that have any overlaps
/// with the `right` set of genomic ranges.
///
/// In database lingo, this is a type of *filtering join*, in particular a *semi join*.
/// See Hadley Wickham's excellent [R for Data
/// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
/// This will exclude the *entire* left range, if it had *any*
/// overlaps with *any* right genomic ranges.
///
/// Note that this consumes the `self` [`GRanges`] object, turning it into a new
/// [`GRanges<VecRangesEmpty, ()>`].
/// This is a type of *filtering join*, in particular a *anti-join*.
/// See Hadley Wickham's [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
pub fn antifilter_overlaps<'a, M: Clone + 'a, DR: 'a>(
self,
// right: &GRanges<COITrees<M>, DR>,
right: &'a impl AsGRangesRef<'a, COITrees<M>, DR>,
) -> Result<GRangesEmpty<VecRangesEmpty>, GRangesError> {
let mut gr = GRangesEmpty::new_vec(&self.seqlens());

let right_ref = right.as_granges_ref();

for (seqname, left_ranges) in self.0.ranges.iter() {
for left_range in left_ranges.iter_ranges() {
if let Some(right_ranges) = right_ref.ranges.get(seqname) {
let has_overlaps =
right_ranges.count_overlaps(left_range.start(), left_range.end()) > 0;
if !has_overlaps {
gr.push_range(seqname, left_range.start(), left_range.end())?;
}
} else {
// if this left range's chrom doesn't exist in right, it doesn't have
// overlaps, so we push
gr.push_range(seqname, left_range.start(), left_range.end())?;
}
}
}
Ok(gr)
}

/// Retain only genomic ranges that have at least one overlap with the `right`
/// set of genomic ranges. The whole range will be retained.
///
/// This will retain the *entire* left range only if it had at least one
/// basepair overlap with *any* right genomic range.
///
/// This is a type of *filtering join*, in particular a *semi-join*.
/// See Hadley Wickham's [R for Data
/// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>(
self,
// right: &GRanges<COITrees<M>, DR>,
Expand Down Expand Up @@ -1319,19 +1354,42 @@ impl<CL, U> GRanges<CL, Vec<U>>
where
CL: IterableRangeContainer,
{
/// Filter out ranges that do *not* have at least overlap with the `right` ranges.
/// Retain only genomic ranges that have at least one overlap with the `right`
/// set of genomic ranges. The whole range will be retained.
///
/// In database lingo, this is a type of *filtering join*, in particular a *semi join*.
/// See Hadley Wickham's excellent [R for Data
/// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
/// This will retain the *entire* left range only if it had at least one
/// basepair overlap with *any* right genomic range.
///
/// Note that this consumes the `self` [`GRanges`] object, turning it into a new
/// [`GRanges<VecRangesIndexed, Vec<U>>`]. The data container is rebuilt from indices
/// into a new [`Vec<U>`] where `U` is the associated type [`IndexedDataContainer::Item`],
/// which represents the individual data element in the data container.
/// This is a type of *filtering join*, in particular a *semi-join*.
/// See Hadley Wickham's [R for Data
/// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>(
self,
right: &'a impl AsGRangesRef<'a, COITrees<M>, DR>,
) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError> {
self._filter_overlaps_base(right, false)
}

/// Exclude genomic ranges in this object that have any overlaps
/// with the `right` set of genomic ranges.
///
/// This will exclude the *entire* left range, if it had *any*
/// overlaps with *any* right genomic ranges.
///
/// This is a type of *filtering join*, in particular a *anti-join*.
/// See Hadley Wickham's [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information.
pub fn antifilter_overlaps<'a, M: Clone + 'a, DR: 'a>(
self,
right: &'a impl AsGRangesRef<'a, COITrees<M>, DR>,
) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError> {
self._filter_overlaps_base(right, true)
}

// internal base function for handling the cases above
fn _filter_overlaps_base<'a, M: Clone + 'a, DR: 'a>(
mut self,
right: &'a impl AsGRangesRef<'a, COITrees<M>, DR>,
anti: bool,
) -> Result<GRanges<VecRangesIndexed, Vec<U>>, GRangesError> {
let mut gr: GRanges<VecRangesIndexed, Vec<U>> = GRanges::new_vec(&self.seqlens());

Expand All @@ -1345,11 +1403,11 @@ where
for (seqname, left_ranges) in self.ranges.iter() {
for left_range in left_ranges.iter_ranges() {
if let Some(right_ranges) = right_ref.ranges.get(seqname) {
let num_overlaps =
right_ranges.count_overlaps(left_range.start(), left_range.end());
if num_overlaps == 0 {
// no overlaps -- skip
} else {
let has_overlaps =
right_ranges.count_overlaps(left_range.start(), left_range.end()) > 0;
// XOR with anti
let passes_filter = has_overlaps != anti;
if passes_filter {
gr.push_range_with_index(
seqname,
left_range.start(),
Expand All @@ -1361,6 +1419,19 @@ where
new_indices.push(current_index);
current_index += 1;
}
} else if anti {
// if this left range's chrom doesn't exist in right, it doesn't have
// overlaps, so we push
gr.push_range_with_index(
seqname,
left_range.start(),
left_range.end(),
current_index,
)?;
// unwrap should be safe, since this is an indexed GRanges
old_indices.insert(left_range.index().unwrap());
new_indices.push(current_index);
current_index += 1;
}
}
}
Expand Down Expand Up @@ -1511,6 +1582,52 @@ mod tests {
assert_eq!(gr_filtered.len(), 3);
}

#[test]
fn granges_antifilter_overlaps() {
// ANTI version of above
let seqlens = seqlens! { "chr1" => 10};

// test 1: one range that overlaps only the first range
let gr = granges_test_case_01();
let mut gr_keep: GRangesEmpty<VecRangesEmpty> = GRangesEmpty::new_vec(&seqlens);
gr_keep.push_range("chr1", 0, 1).unwrap();
let gr_keep = gr_keep.into_coitrees().unwrap();

let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap();
assert_eq!(gr_filtered.len(), 4);

// test 2: one range that overlaps the first two ranges
let gr = granges_test_case_01();
let mut gr_keep: GRangesEmpty<VecRangesEmpty> = GRangesEmpty::new_vec(&seqlens);
gr_keep.push_range("chr1", 0, 5).unwrap();
let gr_keep = gr_keep.into_coitrees().unwrap();

let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap();
assert_eq!(gr_filtered.len(), 3);

// test 3: one range that overlaps no ranges
let gr = granges_test_case_01();
let mut gr_keep: GRangesEmpty<VecRangesEmpty> = GRangesEmpty::new_vec(&seqlens);
gr_keep.push_range("chr1", 8, 9).unwrap();
let gr_keep = gr_keep.into_coitrees().unwrap();

let gr_filtered = gr.clone().antifilter_overlaps(&gr_keep).unwrap();
assert_eq!(gr_filtered.len(), 5);
assert_eq!(gr, gr_filtered);

// test 4: ranges on two chromosomes: first overlaps two ranges, second
// overlaps one
let gr = granges_test_case_01();
let seqlens = seqlens! { "chr1" => 10, "chr2" => 10 };
let mut gr_keep: GRangesEmpty<VecRangesEmpty> = GRangesEmpty::new_vec(&seqlens);
gr_keep.push_range("chr1", 4, 7).unwrap();
gr_keep.push_range("chr2", 10, 12).unwrap();
let gr_keep = gr_keep.into_coitrees().unwrap();

let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap();
assert_eq!(gr_filtered.len(), 2);
}

#[test]
fn test_flanking_left() {
let gr = granges_test_case_02();
Expand Down
4 changes: 1 addition & 3 deletions tests/bedtools_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
use granges::{
commands::granges_random_bed,
io::{parsers::bed::bed_missing, InputStream},
prelude::{
read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator,
},
prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator},
ranges::GenomicRangeRecord,
test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile},
};
Expand Down

0 comments on commit 0863d3f

Please sign in to comment.